{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example of a (tiny) Jupyter notebook in C : a Regular Expression Matcher\n",
    "\n",
    "- See <https://github.com/brendan-rius/jupyter-c-kernel>\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## First version\n",
    "\n",
    "- Reference: <https://www.cs.princeton.edu/courses/archive/spr09/cos333/beautiful.html>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-16T05:41:37.445385Z",
     "start_time": "2021-02-16T05:41:37.372967Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "For regexp ab.*d.*f\n",
      "For text abccdeff, result = 1\n",
      "For text abcokokazodkaozkdazkdcdeff, result = 1\n",
      "For text abdf, result = 1\n",
      "For text abdddffff, result = 1\n",
      "For text SSZSabdddffffZDZD, result = 1\n",
      "For text Abccdeff, result = 0\n",
      "For text aZcokokazodkaozkdazkdcdeff, result = 0\n",
      "For text abZZf, result = 0\n",
      "For text XXaZDbZDdZDdZDdZDfZDfZDfZDfXX, result = 0\n"
     ]
    }
   ],
   "source": [
    "#include <stdio.h>\n",
    "\n",
    "int match(char *regexp, char *text);\n",
    "int matchhere(char *regexp, char *text);\n",
    "int matchstar(int c, char *regexp, char *text);\n",
    "\n",
    "/* match: search for regexp anywhere in text */\n",
    "int match(char *regexp, char *text)\n",
    "{\n",
    "    if (regexp[0] == '^')\n",
    "        return matchhere(regexp+1, text);\n",
    "    do {    /* must look even if string is empty */\n",
    "        if (matchhere(regexp, text))\n",
    "            return 1;\n",
    "    } while (*text++ != '\\0');\n",
    "    return 0;\n",
    "}\n",
    "\n",
    "/* matchhere: search for regexp at beginning of text */\n",
    "int matchhere(char *regexp, char *text)\n",
    "{\n",
    "    if (regexp[0] == '\\0')\n",
    "        return 1;\n",
    "    if (regexp[1] == '*')\n",
    "        return matchstar(regexp[0], regexp+2, text);\n",
    "    if (regexp[0] == '$' && regexp[1] == '\\0')\n",
    "        return *text == '\\0';\n",
    "    if (*text!='\\0' && (regexp[0]=='.' || regexp[0]==*text))\n",
    "        return matchhere(regexp+1, text+1);\n",
    "    return 0;\n",
    "}\n",
    "\n",
    "/* matchstar: search for c*regexp at beginning of text */\n",
    "int matchstar(int c, char *regexp, char *text)\n",
    "{\n",
    "    do {    /* a * matches zero or more instances */\n",
    "        if (matchhere(regexp, text))\n",
    "            return 1;\n",
    "    } while (*text != '\\0' && (*text++ == c || c == '.'));\n",
    "    return 0;\n",
    "}\n",
    "\n",
    "int main(int argc, char *argv) {\n",
    "    char* regexp = \"ab.*d.*f\";\n",
    "    printf(\"For regexp %s\\n\", regexp);\n",
    "    \n",
    "    char* text1   = \"abccdeff\";  // okay\n",
    "    printf(\"For text %s,\", text1);\n",
    "    printf(\" result = %d\\n\", match(regexp, text1));\n",
    "    \n",
    "    char* text2   = \"abcokokazodkaozkdazkdcdeff\";  // okay\n",
    "    printf(\"For text %s,\", text2);\n",
    "    printf(\" result = %d\\n\", match(regexp, text2));\n",
    "    \n",
    "    char* text3   = \"abdf\";  // okay\n",
    "    printf(\"For text %s,\", text3);\n",
    "    printf(\" result = %d\\n\", match(regexp, text3));\n",
    "    \n",
    "    char* text4   = \"abdddffff\";  // okay\n",
    "    printf(\"For text %s,\", text4);\n",
    "    printf(\" result = %d\\n\", match(regexp, text4));\n",
    "    \n",
    "    char* text5   = \"SSZSabdddffffZDZD\";  // okay\n",
    "    printf(\"For text %s,\", text5);\n",
    "    printf(\" result = %d\\n\", match(regexp, text5));\n",
    "    \n",
    "    \n",
    "    char* text11   = \"Abccdeff\";  // pas okay\n",
    "    printf(\"For text %s,\", text11);\n",
    "    printf(\" result = %d\\n\", match(regexp, text11));\n",
    "    \n",
    "    char* text12   = \"aZcokokazodkaozkdazkdcdeff\";  // pas okay\n",
    "    printf(\"For text %s,\", text12);\n",
    "    printf(\" result = %d\\n\", match(regexp, text12));\n",
    "    \n",
    "    char* text13   = \"abZZf\";  // pas okay\n",
    "    printf(\"For text %s,\", text13);\n",
    "    printf(\" result = %d\\n\", match(regexp, text13));\n",
    "    \n",
    "    char* text14   = \"XXaZDbZDdZDdZDdZDfZDfZDfZDfXX\";  // pas okay\n",
    "    printf(\"For text %s,\", text14);\n",
    "    printf(\" result = %d\\n\", match(regexp, text14));\n",
    "\n",
    "    return 0;\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Next version\n",
    "\n",
    "Let's use an online tool, for the $ab.*d.*f$ regexp\n",
    "\n",
    "- See <https://blog.robertelder.org/regular-expression-visualizer/>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-16T05:46:24.670480Z",
     "start_time": "2021-02-16T05:46:24.584719Z"
    },
    "code_folding": [
     14,
     55,
     70,
     78,
     86,
     90
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Let's try to match the string 'abcokokazodkaozkdazkdcdeff' with the regex 'ab.*d.*f'\n",
      "Found this match of length 26 starting at offset 0:\n",
      "abcokokazodkaozkdazkdcdeff\n"
     ]
    }
   ],
   "source": [
    "#include <stdlib.h>\n",
    "#include <stdio.h>\n",
    "#include <string.h>\n",
    "\n",
    "#include <stdio.h>\n",
    "#include <unistd.h>\n",
    "#include <stdarg.h>\n",
    "#include <stdlib.h>\n",
    "#include <sys/wait.h>\n",
    "#include <string.h>\n",
    "\n",
    "#define GREEDY_BRANCH 1\n",
    "#define LAZY_BRANCH 0\n",
    "\n",
    "int split(int branch1, int branch2){\n",
    "  /*  This function is just a fancy way to do a 'fork' call that will split this\n",
    "      process and execute 'branch1', then 'branch2' of the outer 'if' statement\n",
    "      where this function is called.  It will also wait sequentially for 'branch1'\n",
    "      to finish before running 'branch2' rather than running both in parallel.\n",
    "      If 'branch1' returns 0 to indicate success, then 'branch2' will never run. */\n",
    "  pid_t child_pid;\n",
    "  pid_t wpid;\n",
    "  int status = 0;\n",
    "\n",
    "  if((child_pid = fork()) > 0)\n",
    "    while ((wpid = wait(&status)) > 0);/*  Wait on child to finish in 'branch1'.  */\n",
    "  else\n",
    "    return branch1;/*  If this is child, return and contine in 'branch1'. */\n",
    "\n",
    "  if(WIFEXITED(status)){\n",
    "    if(WEXITSTATUS(status) == 0) /* Match found in 'branch1', exit the parent with 0. */\n",
    "      exit(0);\n",
    "    else\n",
    "      /*  Continue on to check second branch... */;\n",
    "  }else{\n",
    "    printf(\"Unexpected Error Case.\\n\");\n",
    "    exit(1);\n",
    "  }\n",
    "\n",
    "  if((child_pid = fork()) > 0)\n",
    "    while ((wpid = wait(&status)) > 0);/*  Wait on child to finish in 'branch2'.  */\n",
    "  else\n",
    "    return branch2;/*  If this is child, return and contine in 'branch2'. */\n",
    "\n",
    "  if(WIFEXITED(status)){\n",
    "    if(WEXITSTATUS(status) == 0) /* Match found in 'branch2', exit the parent with 0. */\n",
    "      exit(0);\n",
    "    else\n",
    "      exit(1); /*  Neither branch found a match. */\n",
    "  }else{\n",
    "    printf(\"Unexpected Error Case.\\n\");\n",
    "    exit(1);\n",
    "  }\n",
    "}\n",
    "\n",
    "int matches_one_of_these_characters(char c, int num_chars, ...){\n",
    "  va_list arglist;\n",
    "  va_start(arglist, num_chars);\n",
    "  int i;\n",
    "  int found = 0;\n",
    "  for(i = 0; i < num_chars; i++){\n",
    "    int d = (char)va_arg(arglist, int);\n",
    "    if(c == d){\n",
    "      found = 1;\n",
    "    }\n",
    "  }\n",
    "  va_end(arglist);\n",
    "  return found;\n",
    "}\n",
    "\n",
    "int matches_this_character(char available, char required){\n",
    "  if(available == required){\n",
    "    return 1;\n",
    "  }else{\n",
    "    return 0;\n",
    "  }\n",
    "}\n",
    "\n",
    "char safely_get_character(const char * str, int offset, int pos, int str_len){\n",
    "  if(offset + pos < str_len){\n",
    "    return str[offset + pos];\n",
    "  }else{\n",
    "    exit(1); /*  End of string reached.*/\n",
    "  }\n",
    "}\n",
    "\n",
    "int is_beginning_of_text(int position_in_string){\n",
    "  return position_in_string == 0;\n",
    "}\n",
    "\n",
    "int is_end_of_text(int position_in_string, int str_len){\n",
    "  return str_len == position_in_string;\n",
    "}\n",
    "\n",
    "/*  The following C program is a hard-coded implementation of\n",
    "    the regular expression 'ab.*d.*f'.  */\n",
    "int regex_search_at_offset(const char * str, int str_len, int offset){\n",
    "  int status = 0;\n",
    "  if(fork() > 0){  /*  If this is the parent */\n",
    "    while (wait(&status) > 0); /*  Wait on child to finish. */\n",
    "    if(WIFEXITED(status)){  /*  Then, return the return code of the child. */\n",
    "      return WEXITSTATUS(status);\n",
    "    }else{\n",
    "      printf(\"Unexpected Error Case.\\n\");\n",
    "      exit(1);\n",
    "    }\n",
    "  }else{/* Continue with the search in a child process and send exit code to parent. */\n",
    "    int pos = 0; /*  The position we're currently checking in the search string. */\n",
    "    char current_character;\n",
    "    /*  Begin our regex search. */\n",
    "    current_character = safely_get_character(str, offset, pos, str_len);\n",
    "    if(matches_this_character(current_character, 'a')){\n",
    "      pos++;\n",
    "    }else{\n",
    "      exit(1);\n",
    "    }\n",
    "    current_character = safely_get_character(str, offset, pos, str_len);\n",
    "    if(matches_this_character(current_character, 'b')){\n",
    "      pos++;\n",
    "    }else{\n",
    "      exit(1);\n",
    "    }\n",
    "    node_8:\n",
    "    if(split(GREEDY_BRANCH, LAZY_BRANCH) > 0){\n",
    "      current_character = safely_get_character(str, offset, pos, str_len);\n",
    "      if(!matches_one_of_these_characters(current_character, 2, '\\n', '\\r')){\n",
    "        pos++;\n",
    "      }else{\n",
    "        exit(1);\n",
    "      }\n",
    "      goto node_8;\n",
    "    }else{\n",
    "      current_character = safely_get_character(str, offset, pos, str_len);\n",
    "      if(matches_this_character(current_character, 'd')){\n",
    "        pos++;\n",
    "      }else{\n",
    "        exit(1);\n",
    "      }\n",
    "      node_5:\n",
    "      if(split(GREEDY_BRANCH, LAZY_BRANCH) > 0){\n",
    "        current_character = safely_get_character(str, offset, pos, str_len);\n",
    "        if(!matches_one_of_these_characters(current_character, 2, '\\n', '\\r')){\n",
    "          pos++;\n",
    "        }else{\n",
    "          exit(1);\n",
    "        }\n",
    "        goto node_5;\n",
    "      }else{\n",
    "        current_character = safely_get_character(str, offset, pos, str_len);\n",
    "        if(matches_this_character(current_character, 'f')){\n",
    "          pos++;\n",
    "        }else{\n",
    "          exit(1);\n",
    "        }\n",
    "        /*  We've got a match. */\n",
    "        printf(\"Found this match of length %d starting at offset %d:\\n\", pos, offset);\n",
    "        for(int i = 0; i < pos; i++){\n",
    "          printf(\"%c\", str[offset + i]);\n",
    "        }\n",
    "        printf(\"\\n\");\n",
    "        exit(0);\n",
    "      }\n",
    "    }\n",
    "  }\n",
    "}\n",
    "\n",
    "int main(int argc, char *argv[]){\n",
    "  // For text \"abccdeff\", result = 1\n",
    "  // For text \"abcokokazodkaozkdazkdcdeff\", result = 1\n",
    "  // For text \"abdf\", result = 1\n",
    "  // For text \"abdddffff\", result = 1\n",
    "  // For text \"SSZSabdddffffZDZD\", result = 1\n",
    "  // For text \"Abccdeff\", result = 0\n",
    "  // For text \"aZcokokazodkaozkdazkdcdeff\", result = 0\n",
    "  // For text \"abZZf\", result = 0\n",
    "  // For text \"XXaZDbZDdZDdZDdZDfZDfZDfZDfXX\", result = 0\n",
    "\n",
    "  /* The string to search. You can change this to anything. */\n",
    "  const char * str = \"abcokokazodkaozkdazkdcdeff\";\n",
    "  int str_len = (int)strlen(str);\n",
    "  printf(\"Let's try to match the string '%s' with the regex 'ab.*d.*f'\\n\", str);\n",
    "  for(int offset = 0; offset < str_len; offset++){\n",
    "    if(regex_search_at_offset(str, str_len, offset) == 0){\n",
    "      exit(0);\n",
    "    }\n",
    "  }\n",
    "  printf(\"No match Found!\\n\");\n",
    "  return 1;\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusion\n",
    "That's it for today folks!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "C",
   "language": "c",
   "name": "c"
  },
  "language_info": {
   "file_extension": ".c",
   "mimetype": "text/plain",
   "name": "c"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}