\usepackage{xparse}
\usepackage{shellesc}

% Marker appended (via sed) to every line BudouX emits, i.e. at every
% permitted break position.  It is the private-use character U+F002, written
% in ^^^^-notation so the definition stays visible and survives editors and
% tools that drop invisible characters.  The ^^^^ form needs a Unicode
% engine (XeTeX/LuaTeX), as does the "F002 comparison in the expl3 code.
\newcommand{\budouxspecialchar}{^^^^f002} % 0xF002

\ExplSyntaxOn

% Output stream used to write the text that is handed to BudouX.
\iow_new:N \l__orig_file_stream

% \__write_orig_file:nn {<file name>} {<text>}
%   Opens <file name> for writing, writes <text> with a single
%   \iow_now:Nn call (n-type argument) and closes the stream again.
%   Declared protected because it performs I/O and therefore must not be
%   expanded inside e-/x-type arguments.
\cs_new_protected:Npn \__write_orig_file:nn #1 #2 {
  \iow_open:Nn \l__orig_file_stream { #1 }
  \iow_now:Nn \l__orig_file_stream { #2 }
  \iow_close:N \l__orig_file_stream
}

% Input stream used to read back the file produced by BudouX.  It is only
% ever passed to \ior_... functions, so it is allocated with \ior_new:N
% (it used to be allocated as an output stream with \iow_new:N).
\ior_new:N \l__processed_file_stream
% Receives the concatenated contents of the processed file.
\tl_new:N \l__processed_tl

% \__read_processed_file:n {<file name>}
%   Clears \l__processed_tl, then appends every chunk delivered by
%   \ior_map_inline:Nn for <file name> to it, in order, and closes the
%   stream.  Declared protected because it performs assignments and I/O.
\cs_new_protected:Npn \__read_processed_file:n #1 {
  \tl_clear:N \l__processed_tl
  \ior_open:Nn \l__processed_file_stream { #1 }
  \ior_map_inline:Nn \l__processed_file_stream {
    \tl_put_right:Nn \l__processed_tl { ##1 }
  }
  \ior_close:N \l__processed_file_stream
}

% Number of recursion steps taken since the last break marker (or since the
% start).  A separator is only inserted while this is greater than 1.
\int_new:N \l__count_int
% Set after every emitted item: true if that item matched the ASCII class
% below, false otherwise.
\bool_new:N \l__is_alphabet_bool

% ASCII punctuation, digits and Latin letters.  Compiled once here instead
% of being recompiled by \regex_match:nnTF on every recursion step.
% NOTE(review): under \ExplSyntaxOn the `~` inside the class is a space
% token rather than a literal tilde -- confirm that this is intended.
\regex_const:Nn \c__budoux_ascii_regex
  { [!\#\$\%&'()*+,-./:;<=>?@[\]^_`{|}~0-9a-zA-Z] }

% \__insert_between_chars_rec:nn {<separator>} {<tokens>}
%   Consumes <tokens> one item per call and appends the result to
%   \l__output_tl (which the caller must have initialised):
%     * empty <tokens>: stop;
%     * leading brace group: its contents are processed recursively and
%       wrapped in \bgroup ... \egroup, then the rest is processed;
%     * leading U+F002 marker: the counter is reset and the marker plus the
%       item following it are dropped;
%     * anything else: the item is appended, preceded by <separator> when
%       the counter exceeds 1 and either the item is outside the ASCII
%       class or the previous item was outside it.
%   Declared protected because it performs assignments.
\cs_new_protected:Npn \__insert_between_chars_rec:nn #1 #2 {
  % Every call counts as a step, including those for groups and for the
  % final empty tail.
  \int_incr:N \l__count_int

  \tl_if_empty:nF { #2 } {
    \tl_if_head_is_group:nTF { #2 } {
      \tl_put_right:Nn \l__output_tl { \bgroup }
      \tl_clear_new:N \l__next_group_tl
      \exp_args:NNe \tl_set:Nn \l__next_group_tl { \tl_head:n { #2 } }
      % \tl_head:n and \tl_tail:n skip explicit spaces, so turn them into
      % \c_space_token items before walking the group contents.
      \tl_replace_all:Nnn \l__next_group_tl { ~ } { \c_space_token }
      \exp_args:Nno \__insert_between_chars_rec:nn { #1 } { \l__next_group_tl }
      \tl_put_right:Nn \l__output_tl { \egroup }

      \exp_args:Nne \__insert_between_chars_rec:nn { #1 } { \tl_tail:n { #2 } }
    } {
      \exp_args:Nne \tl_if_head_eq_charcode:nNTF { #2 } { \char_generate:nn { "F002 } { 12 } } {
        \int_zero:N \l__count_int
        % Skip the marker and the item right after it (the space that comes
        % from the newline added by BudouX).
        \exp_args:Nne \__insert_between_chars_rec:nn { #1 } { \exp_args:Ne \tl_tail:n { \tl_tail:n { #2 } } }
      } {
        % An explicit leading space would be skipped by \tl_head:n and
        % \tl_tail:n below, so emit it here.  (This used to append the
        % tokens `\tl_head:n { ~ }', which produce nothing.)
        \tl_if_head_is_space:nT { #2 } {
          \tl_put_right:Nn \l__output_tl { ~ }
        }

        \exp_args:NNe \regex_match:NnTF \c__budoux_ascii_regex { \tl_head:n { #2 } } {
          % ASCII item: separate it only from a preceding non-ASCII item.
          \bool_if:nT {
            \int_compare_p:nNn { \l__count_int } > { 1 } && !\l__is_alphabet_bool
          } {
            \tl_put_right:Nn \l__output_tl { #1 }
          }

          \bool_set_true:N \l__is_alphabet_bool
        } {
          % Any other item: always separate it from what precedes it.
          \int_compare:nNnT { \l__count_int } > { 1 } {
            \tl_put_right:Nn \l__output_tl { #1 }
          }

          \bool_set_false:N \l__is_alphabet_bool
        }

        \exp_args:NNe \tl_put_right:Nn \l__output_tl { \tl_head:n { #2 } }
        \exp_args:Nne \__insert_between_chars_rec:nn { #1 } { \tl_tail:n { #2 } }
      }
    }
  }
}

% \__insert_between_chars:nn {<separator>} {<tokens>}
%   Entry point of the character walker: stores <tokens> in \l__input_tl,
%   replaces explicit spaces by \c_space_token, resets the step counter, the
%   ASCII flag and \l__output_tl, runs \__insert_between_chars_rec:nn over
%   the input and finally typesets \l__output_tl.
%   Declared protected because it performs assignments.
\cs_new_protected:Npn \__insert_between_chars:nn #1 #2 {
  \tl_clear_new:N \l__input_tl
  \tl_set:Nn \l__input_tl { #2 }
  \tl_replace_all:Nnn \l__input_tl { ~ } { \c_space_token }

  % Fresh state for this run.
  \int_zero:N \l__count_int
  \bool_set_false:N \l__is_alphabet_bool
  \tl_clear_new:N \l__output_tl

  \exp_args:Nno \__insert_between_chars_rec:nn { #1 } { \l__input_tl }
  \tl_use:N \l__output_tl
}

% \__insert_nolinebreak:n {<tokens>}
%   Typesets <tokens> through \__insert_between_chars:nn with
%   \nolinebreak[3] as the separator.
%   Declared protected because the function it calls performs assignments.
\cs_new_protected:Npn \__insert_nolinebreak:n #1 {
  \__insert_between_chars:nn { \nolinebreak[3] } { #1 }
}

% \budouximpl{<file name>}
%   Document-level wrapper: loads <file name> into \l__processed_tl and
%   passes the value of that variable to \__insert_nolinebreak:n.
\newcommand{\budouximpl}[1]{
  \__read_processed_file:n { #1 }
  \exp_args:NV \__insert_nolinebreak:n \l__processed_tl
}

% \outputorigfileimpl{<file name>}{<text>}
%   Document-level wrapper that writes <text> to <file name> through
%   \l__orig_file_stream: open, one immediate write, close.
\newcommand{\outputorigfileimpl}[2]{
  \iow_open:Nn \l__orig_file_stream { #1 }
  \iow_now:Nn \l__orig_file_stream { #2 }
  \iow_close:N \l__orig_file_stream
}

\ExplSyntaxOff

% \budoux[<setup>]{<text>}
%   Typesets <text> as its own paragraph:
%     1. <text> is written to _budoux-\jobname-original.txt;
%     2. a shell pipeline feeds that file to the `budoux' command and lets
%        sed append \budouxspecialchar to every output line, storing the
%        result in _budoux-\jobname-processed.txt;
%     3. <setup> (default: \raggedright) is applied and the processed file
%        is typeset through \budouximpl, followed by \par;
%     4. both temporary files are removed.
%   Everything happens inside a group, so <setup> and the two file-name
%   macros stay local.  The commands are run through \ShellEscape, so shell
%   escape has to be enabled for the run.
\NewDocumentCommand{\budoux}{O{\raggedright} m}{{%
  \newcommand\origfile{_budoux-\jobname-original.txt}%
  \newcommand\processedfile{_budoux-\jobname-processed.txt}%
  % The trailing % below was missing; the end of line then became a space
  % token that was typeset whenever \budoux was used in horizontal mode.
  \outputorigfileimpl{\origfile}{#2}%
  \ShellEscape{export LC_ALL=C.UTF-8; cat \origfile | budoux | sed 's/$/\budouxspecialchar/' > \processedfile}%
  #1%
  \budouximpl{\processedfile}\par%
  \ShellEscape{rm \origfile}%
  \ShellEscape{rm \processedfile}%
}}