{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example of a (tiny) Jupyter notebook in C : a Regular Expression Matcher\n", "\n", "- See \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## First version\n", "\n", "- Reference: " ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2021-02-16T05:41:37.445385Z", "start_time": "2021-02-16T05:41:37.372967Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "For regexp ab.*d.*f\n", "For text abccdeff, result = 1\n", "For text abcokokazodkaozkdazkdcdeff, result = 1\n", "For text abdf, result = 1\n", "For text abdddffff, result = 1\n", "For text SSZSabdddffffZDZD, result = 1\n", "For text Abccdeff, result = 0\n", "For text aZcokokazodkaozkdazkdcdeff, result = 0\n", "For text abZZf, result = 0\n", "For text XXaZDbZDdZDdZDdZDfZDfZDfZDfXX, result = 0\n" ] } ], "source": [ "#include \n", "\n", "int match(char *regexp, char *text);\n", "int matchhere(char *regexp, char *text);\n", "int matchstar(int c, char *regexp, char *text);\n", "\n", "/* match: search for regexp anywhere in text */\n", "int match(char *regexp, char *text)\n", "{\n", " if (regexp[0] == '^')\n", " return matchhere(regexp+1, text);\n", " do { /* must look even if string is empty */\n", " if (matchhere(regexp, text))\n", " return 1;\n", " } while (*text++ != '\\0');\n", " return 0;\n", "}\n", "\n", "/* matchhere: search for regexp at beginning of text */\n", "int matchhere(char *regexp, char *text)\n", "{\n", " if (regexp[0] == '\\0')\n", " return 1;\n", " if (regexp[1] == '*')\n", " return matchstar(regexp[0], regexp+2, text);\n", " if (regexp[0] == '$' && regexp[1] == '\\0')\n", " return *text == '\\0';\n", " if (*text!='\\0' && (regexp[0]=='.' || regexp[0]==*text))\n", " return matchhere(regexp+1, text+1);\n", " return 0;\n", "}\n", "\n", "/* matchstar: search for c*regexp at beginning of text */\n", "int matchstar(int c, char *regexp, char *text)\n", "{\n", " do { /* a * matches zero or more instances */\n", " if (matchhere(regexp, text))\n", " return 1;\n", " } while (*text != '\\0' && (*text++ == c || c == '.'));\n", " return 0;\n", "}\n", "\n", "int main(int argc, char *argv) {\n", " char* regexp = \"ab.*d.*f\";\n", " printf(\"For regexp %s\\n\", regexp);\n", " \n", " char* text1 = \"abccdeff\"; // okay\n", " printf(\"For text %s,\", text1);\n", " printf(\" result = %d\\n\", match(regexp, text1));\n", " \n", " char* text2 = \"abcokokazodkaozkdazkdcdeff\"; // okay\n", " printf(\"For text %s,\", text2);\n", " printf(\" result = %d\\n\", match(regexp, text2));\n", " \n", " char* text3 = \"abdf\"; // okay\n", " printf(\"For text %s,\", text3);\n", " printf(\" result = %d\\n\", match(regexp, text3));\n", " \n", " char* text4 = \"abdddffff\"; // okay\n", " printf(\"For text %s,\", text4);\n", " printf(\" result = %d\\n\", match(regexp, text4));\n", " \n", " char* text5 = \"SSZSabdddffffZDZD\"; // okay\n", " printf(\"For text %s,\", text5);\n", " printf(\" result = %d\\n\", match(regexp, text5));\n", " \n", " \n", " char* text11 = \"Abccdeff\"; // pas okay\n", " printf(\"For text %s,\", text11);\n", " printf(\" result = %d\\n\", match(regexp, text11));\n", " \n", " char* text12 = \"aZcokokazodkaozkdazkdcdeff\"; // pas okay\n", " printf(\"For text %s,\", text12);\n", " printf(\" result = %d\\n\", match(regexp, text12));\n", " \n", " char* text13 = \"abZZf\"; // pas okay\n", " printf(\"For text %s,\", text13);\n", " printf(\" result = %d\\n\", match(regexp, text13));\n", " \n", " char* text14 = \"XXaZDbZDdZDdZDdZDfZDfZDfZDfXX\"; // pas okay\n", " printf(\"For text %s,\", text14);\n", " printf(\" result = %d\\n\", match(regexp, text14));\n", "\n", " return 0;\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Next version\n", "\n", "Let's use an online tool, for the $ab.*d.*f$ regexp\n", "\n", "- See " ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2021-02-16T05:46:24.670480Z", "start_time": "2021-02-16T05:46:24.584719Z" }, "code_folding": [ 14, 55, 70, 78, 86, 90 ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Let's try to match the string 'abcokokazodkaozkdazkdcdeff' with the regex 'ab.*d.*f'\n", "Found this match of length 26 starting at offset 0:\n", "abcokokazodkaozkdazkdcdeff\n" ] } ], "source": [ "#include \n", "#include \n", "#include \n", "\n", "#include \n", "#include \n", "#include \n", "#include \n", "#include \n", "#include \n", "\n", "#define GREEDY_BRANCH 1\n", "#define LAZY_BRANCH 0\n", "\n", "int split(int branch1, int branch2){\n", " /* This function is just a fancy way to do a 'fork' call that will split this\n", " process and execute 'branch1', then 'branch2' of the outer 'if' statement\n", " where this function is called. It will also wait sequentially for 'branch1'\n", " to finish before running 'branch2' rather than running both in parallel.\n", " If 'branch1' returns 0 to indicate success, then 'branch2' will never run. */\n", " pid_t child_pid;\n", " pid_t wpid;\n", " int status = 0;\n", "\n", " if((child_pid = fork()) > 0)\n", " while ((wpid = wait(&status)) > 0);/* Wait on child to finish in 'branch1'. */\n", " else\n", " return branch1;/* If this is child, return and contine in 'branch1'. */\n", "\n", " if(WIFEXITED(status)){\n", " if(WEXITSTATUS(status) == 0) /* Match found in 'branch1', exit the parent with 0. */\n", " exit(0);\n", " else\n", " /* Continue on to check second branch... */;\n", " }else{\n", " printf(\"Unexpected Error Case.\\n\");\n", " exit(1);\n", " }\n", "\n", " if((child_pid = fork()) > 0)\n", " while ((wpid = wait(&status)) > 0);/* Wait on child to finish in 'branch2'. */\n", " else\n", " return branch2;/* If this is child, return and contine in 'branch2'. */\n", "\n", " if(WIFEXITED(status)){\n", " if(WEXITSTATUS(status) == 0) /* Match found in 'branch2', exit the parent with 0. */\n", " exit(0);\n", " else\n", " exit(1); /* Neither branch found a match. */\n", " }else{\n", " printf(\"Unexpected Error Case.\\n\");\n", " exit(1);\n", " }\n", "}\n", "\n", "int matches_one_of_these_characters(char c, int num_chars, ...){\n", " va_list arglist;\n", " va_start(arglist, num_chars);\n", " int i;\n", " int found = 0;\n", " for(i = 0; i < num_chars; i++){\n", " int d = (char)va_arg(arglist, int);\n", " if(c == d){\n", " found = 1;\n", " }\n", " }\n", " va_end(arglist);\n", " return found;\n", "}\n", "\n", "int matches_this_character(char available, char required){\n", " if(available == required){\n", " return 1;\n", " }else{\n", " return 0;\n", " }\n", "}\n", "\n", "char safely_get_character(const char * str, int offset, int pos, int str_len){\n", " if(offset + pos < str_len){\n", " return str[offset + pos];\n", " }else{\n", " exit(1); /* End of string reached.*/\n", " }\n", "}\n", "\n", "int is_beginning_of_text(int position_in_string){\n", " return position_in_string == 0;\n", "}\n", "\n", "int is_end_of_text(int position_in_string, int str_len){\n", " return str_len == position_in_string;\n", "}\n", "\n", "/* The following C program is a hard-coded implementation of\n", " the regular expression 'ab.*d.*f'. */\n", "int regex_search_at_offset(const char * str, int str_len, int offset){\n", " int status = 0;\n", " if(fork() > 0){ /* If this is the parent */\n", " while (wait(&status) > 0); /* Wait on child to finish. */\n", " if(WIFEXITED(status)){ /* Then, return the return code of the child. */\n", " return WEXITSTATUS(status);\n", " }else{\n", " printf(\"Unexpected Error Case.\\n\");\n", " exit(1);\n", " }\n", " }else{/* Continue with the search in a child process and send exit code to parent. */\n", " int pos = 0; /* The position we're currently checking in the search string. */\n", " char current_character;\n", " /* Begin our regex search. */\n", " current_character = safely_get_character(str, offset, pos, str_len);\n", " if(matches_this_character(current_character, 'a')){\n", " pos++;\n", " }else{\n", " exit(1);\n", " }\n", " current_character = safely_get_character(str, offset, pos, str_len);\n", " if(matches_this_character(current_character, 'b')){\n", " pos++;\n", " }else{\n", " exit(1);\n", " }\n", " node_8:\n", " if(split(GREEDY_BRANCH, LAZY_BRANCH) > 0){\n", " current_character = safely_get_character(str, offset, pos, str_len);\n", " if(!matches_one_of_these_characters(current_character, 2, '\\n', '\\r')){\n", " pos++;\n", " }else{\n", " exit(1);\n", " }\n", " goto node_8;\n", " }else{\n", " current_character = safely_get_character(str, offset, pos, str_len);\n", " if(matches_this_character(current_character, 'd')){\n", " pos++;\n", " }else{\n", " exit(1);\n", " }\n", " node_5:\n", " if(split(GREEDY_BRANCH, LAZY_BRANCH) > 0){\n", " current_character = safely_get_character(str, offset, pos, str_len);\n", " if(!matches_one_of_these_characters(current_character, 2, '\\n', '\\r')){\n", " pos++;\n", " }else{\n", " exit(1);\n", " }\n", " goto node_5;\n", " }else{\n", " current_character = safely_get_character(str, offset, pos, str_len);\n", " if(matches_this_character(current_character, 'f')){\n", " pos++;\n", " }else{\n", " exit(1);\n", " }\n", " /* We've got a match. */\n", " printf(\"Found this match of length %d starting at offset %d:\\n\", pos, offset);\n", " for(int i = 0; i < pos; i++){\n", " printf(\"%c\", str[offset + i]);\n", " }\n", " printf(\"\\n\");\n", " exit(0);\n", " }\n", " }\n", " }\n", "}\n", "\n", "int main(int argc, char *argv[]){\n", " // For text \"abccdeff\", result = 1\n", " // For text \"abcokokazodkaozkdazkdcdeff\", result = 1\n", " // For text \"abdf\", result = 1\n", " // For text \"abdddffff\", result = 1\n", " // For text \"SSZSabdddffffZDZD\", result = 1\n", " // For text \"Abccdeff\", result = 0\n", " // For text \"aZcokokazodkaozkdazkdcdeff\", result = 0\n", " // For text \"abZZf\", result = 0\n", " // For text \"XXaZDbZDdZDdZDdZDfZDfZDfZDfXX\", result = 0\n", "\n", " /* The string to search. You can change this to anything. */\n", " const char * str = \"abcokokazodkaozkdazkdcdeff\";\n", " int str_len = (int)strlen(str);\n", " printf(\"Let's try to match the string '%s' with the regex 'ab.*d.*f'\\n\", str);\n", " for(int offset = 0; offset < str_len; offset++){\n", " if(regex_search_at_offset(str, str_len, offset) == 0){\n", " exit(0);\n", " }\n", " }\n", " printf(\"No match Found!\\n\");\n", " return 1;\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Conclusion\n", "That's it for today folks!" ] } ], "metadata": { "kernelspec": { "display_name": "C", "language": "c", "name": "c" }, "language_info": { "file_extension": ".c", "mimetype": "text/plain", "name": "c" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }