{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# My own tiny similarity and plagiarism detector in Python scikit-learn\n", "\n", "- I want to try to have a small similarity and plagiarism detector, for any files!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## First try\n", "\n", "- Source: " ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:44:21.411567Z", "start_time": "2021-02-21T08:44:21.404864Z" } }, "outputs": [], "source": [ "import os\n", "\n", "extension = 'txt'\n", "extension = 'ml'\n", "extension = 'java'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:44:35.206662Z", "start_time": "2021-02-21T08:44:35.184798Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "students_files: ['Ivann_VYSLANKO.java', 'Ethan_GAUTHIER.java', 'Florian_EPAIN__Salma_BEN_AYAD.java', 'Astrid_PELISSOU.java', 'Clement_DESBROUSSES__Dorian_LAHOCHE_MA1-2.java', 'Adélaïde_MONTEMBAULT__Mealig_LE_GUEVEL_MA1-2.java', 'Mohamed__MOUHIMINE_MA1-2.java', 'Enzo_LEGUY__Albane_CHALLAMEL.java', 'Axel_ALLAIN.java', 'Enzo_SOLDI.java', 'Remi_CAZOULAT.java', 'Andrea-Karol-JAKUBOWSKI.java', 'Louis_LIEUTAUD.java', 'Lena_ARHUIS.java', 'EOuann_AUBRY__Mathias_SALDANHA.java', 'Nouhou_OUSSENI__Emmanuel_LE_PANNERER.java', 'Yoan_PETTORELLI.java', 'Thomas_DERRIEN.java', 'Ariane_NICOLAS.java', 'Yasmine_TELLACHE.java', 'Florian_EPAIN.java', 'Lea_AUBRY__Iska_LE_MENN.java', 'Naoufel_GIRARD_MA1-2.java', 'Ryan_BORCHANI__Ael_COIC.java', 'Tom_CHAUVEAU.java', 'Yann_BALLANGER__Abel_LOCOCGUEN_MA1-2.java', 'Theo_LE_GOC.java', 'Gabriel_STIERER__Romain_SINIC_MA1-2.java', 'Bouchra_BOUSSIF__Youssouf_DIAKITE.java', 'Divi_SINQUIN__Alexane_FAISANT.java', 'Amelie_BREJOT__Clemence_BOUVIER_MA1-2.java', 'Mathurin_GESNY__Jean_MARIN_MA1-2.java', 'Vincent_FARAD.java', 'Camille_GOURVELLEC_Flavie_BANNIER.java', 'Yvan_LEFEVRE.java', 'Antonin_FAGAT.java', 'Clemence_CAVEY__Waly_MOYSE.java', 'Ewen_HEINRY.java', 'Mathilde_LECUYER_Axelle_LE_GUENNEC.java', 'Julien_BOYER.java', 'Nicolas_LE_GUERN__Flavien_LEBRET.java', 'Jean_MARIN__Mathurin_GESNY_MA1-2.java', 'Clement_GESTIN__Enzo_SOLDI.java', 'Dimitri_BERGEAULT.java', 'Asmaa_NAZIH__Manon_CADET.java']\n" ] } ], "source": [ "student_files = [doc for doc in os.listdir() if doc.endswith(f'.{extension}')]\n", "print(\"students_files:\", student_files)\n", "\n", "student_notes =[open(File).read() for File in student_files]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:44:51.143453Z", "start_time": "2021-02-21T08:44:51.135819Z" } }, "outputs": [ { "data": { "text/plain": [ "14785" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(student_notes)\n", "len(student_notes[0])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:46:00.252694Z", "start_time": "2021-02-21T08:45:59.618510Z" } }, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:46:01.177188Z", "start_time": "2021-02-21T08:46:01.072879Z" } }, "outputs": [], "source": [ "vectorize = lambda Text: TfidfVectorizer().fit_transform(Text).toarray()\n", "similarity = lambda doc1, doc2: cosine_similarity([doc1, doc2])\n", "\n", "vectors = vectorize(student_notes)\n", "s_vectors = list(zip(student_files, vectors))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:46:02.072798Z", "start_time": "2021-02-21T08:46:02.055681Z" } }, "outputs": [], "source": [ "def check_plagiarism(s_vectors):\n", " plagiarism_results = set()\n", " for student_a, text_vector_a in s_vectors:\n", " new_vectors =s_vectors.copy()\n", " current_index = new_vectors.index((student_a, text_vector_a))\n", " del new_vectors[current_index]\n", " for student_b , text_vector_b in new_vectors:\n", " sim_score = similarity(text_vector_a, text_vector_b)[0][1]\n", " student_pair = sorted((student_a, student_b))\n", " score = (student_pair[0], student_pair[1],sim_score)\n", " plagiarism_results.add(score)\n", " return plagiarism_results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let's sort by increasing similarity index:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:50:04.853108Z", "start_time": "2021-02-21T08:50:04.839648Z" } }, "outputs": [], "source": [ "def sort_plagiarism(s_vectors):\n", " data_plagiarism = check_plagiarism(s_vectors)\n", " sorted_data = sorted(data_plagiarism, key=lambda n1n2score: n1n2score[::-1])\n", " return sorted_data" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:48:20.997351Z", "start_time": "2021-02-21T08:48:20.980053Z" } }, "source": [ "And then filter also:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:50:25.920603Z", "start_time": "2021-02-21T08:50:25.910470Z" } }, "outputs": [], "source": [ "def filter_plagiarism(sorted_data, threshold=0.70):\n", " return [ scoren1n2 for scoren1n2 in sorted_data if scoren1n2[-1] >= threshold ]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2021-02-21T08:55:37.154850Z", "start_time": "2021-02-21T08:55:36.824135Z" }, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Files Adéla and Cleme have similarity = 70.02%\n", "Files Adéla and Enzo_ have similarity = 70.02%\n", "Files Camil and Tom_C have similarity = 70.04%\n", "Files Bouch and Nicol have similarity = 70.13%\n", "Files Bouch and Lea_A have similarity = 70.22%\n", "Files Louis and Thoma have similarity = 70.33%\n", "Files Thoma and Yvan_ have similarity = 70.33%\n", "Files Adéla and Bouch have similarity = 70.34%\n", "Files Ameli and Ewen_ have similarity = 70.46%\n", "Files Dimit and EOuan have similarity = 70.55%\n", "Files Julie and Lena_ have similarity = 70.65%\n", "Files Gabri and Ryan_ have similarity = 70.67%\n", "Files Camil and Gabri have similarity = 70.74%\n", "Files Thoma and Tom_C have similarity = 70.76%\n", "Files Nicol and Ryan_ have similarity = 71.04%\n", "Files Astri and Nicol have similarity = 71.05%\n", "Files Adéla and Dimit have similarity = 71.13%\n", "Files Astri and Divi_ have similarity = 71.19%\n", "Files Naouf and Yann_ have similarity = 71.28%\n", "Files Flori and Flori have similarity = 71.34%\n", "Files Andre and Astri have similarity = 71.39%\n", "Files Andre and Louis have similarity = 71.44%\n", "Files Andre and Yvan_ have similarity = 71.44%\n", "Files Camil and Dimit have similarity = 71.48%\n", "Files Dimit and Ryan_ have similarity = 71.69%\n", "Files Thoma and Yasmi have similarity = 71.72%\n", "Files Bouch and Camil have similarity = 71.74%\n", "Files Bouch and Julie have similarity = 71.75%\n", "Files Divi_ and Yasmi have similarity = 71.79%\n", "Files Astri and Louis have similarity = 71.82%\n", "Files Astri and Yvan_ have similarity = 71.82%\n", "Files Andre and Tom_C have similarity = 71.90%\n", "Files Astri and Tom_C have similarity = 71.91%\n", "Files Julie and Yann_ have similarity = 71.91%\n", "Files Adéla and Divi_ have similarity = 72.04%\n", "Files Dimit and Nicol have similarity = 72.05%\n", "Files Dimit and Lea_A have similarity = 72.05%\n", "Files Nicol and Yasmi have similarity = 72.11%\n", "Files Divi_ and Julie have similarity = 72.14%\n", "Files EOuan and Nicol have similarity = 72.23%\n", "Files Nicol and Yann_ have similarity = 72.28%\n", "Files Ameli and Naouf have similarity = 72.40%\n", "Files Camil and Julie have similarity = 72.49%\n", "Files Adéla and Naouf have similarity = 72.49%\n", "Files Ameli and Nouho have similarity = 72.75%\n", "Files Gabri and Yann_ have similarity = 72.78%\n", "Files Camil and Nicol have similarity = 72.89%\n", "Files Divi_ and Nicol have similarity = 72.89%\n", "Files Ameli and Gabri have similarity = 73.23%\n", "Files Camil and Divi_ have similarity = 73.30%\n", "Files Ameli and Bouch have similarity = 73.31%\n", "Files Lea_A and Ryan_ have similarity = 73.37%\n", "Files EOuan and Lea_A have similarity = 73.78%\n", "Files Adéla and Julie have similarity = 73.79%\n", "Files Lea_A and Yann_ have similarity = 73.83%\n", "Files Divi_ and Ryan_ have similarity = 74.01%\n", "Files Astri and Yasmi have similarity = 74.16%\n", "Files Adéla and Yann_ have similarity = 74.18%\n", "Files Ameli and Divi_ have similarity = 74.19%\n", "Files Bouch and Yann_ have similarity = 74.24%\n", "Files Divi_ and Lea_A have similarity = 74.27%\n", "Files Julie and Lea_A have similarity = 74.37%\n", "Files EOuan and Naouf have similarity = 74.37%\n", "Files Bouch and Ryan_ have similarity = 74.50%\n", "Files Camil and Yann_ have similarity = 74.60%\n", "Files Naouf and Ryan_ have similarity = 74.70%\n", "Files Adéla and EOuan have similarity = 74.82%\n", "Files Camil and Naouf have similarity = 74.83%\n", "Files Divi_ and Tom_C have similarity = 75.00%\n", "Files Lea_A and Tom_C have similarity = 75.08%\n", "Files Camil and EOuan have similarity = 75.23%\n", "Files Divi_ and Louis have similarity = 75.24%\n", "Files Divi_ and Yvan_ have similarity = 75.24%\n", "Files Lea_A and Louis have similarity = 75.38%\n", "Files Lea_A and Yvan_ have similarity = 75.38%\n", "Files Ryan_ and Yann_ have similarity = 75.43%\n", "Files Ameli and Louis have similarity = 75.45%\n", "Files Ameli and Yvan_ have similarity = 75.45%\n", "Files Adéla and Camil have similarity = 75.45%\n", "Files Ameli and Julie have similarity = 75.51%\n", "Files Ameli and EOuan have similarity = 75.71%\n", "Files Adéla and Lea_A have similarity = 75.76%\n", "Files Andre and Yasmi have similarity = 75.84%\n", "Files Nouho and Ryan_ have similarity = 76.18%\n", "Files Louis and Yasmi have similarity = 76.23%\n", "Files Yasmi and Yvan_ have similarity = 76.23%\n", "Files Lea_A and Nicol have similarity = 76.26%\n", "Files Dimit and Yann_ have similarity = 76.30%\n", "Files Tom_C and Yasmi have similarity = 76.31%\n", "Files Camil and Lea_A have similarity = 76.52%\n", "Files Louis and Nicol have similarity = 76.64%\n", "Files Nicol and Yvan_ have similarity = 76.64%\n", "Files Ameli and Tom_C have similarity = 76.69%\n", "Files Ameli and Nicol have similarity = 76.84%\n", "Files Ameli and Camil have similarity = 76.99%\n", "Files EOuan and Ryan_ have similarity = 77.02%\n", "Files Nicol and Tom_C have similarity = 77.25%\n", "Files Julie and Ryan_ have similarity = 77.31%\n", "Files Adéla and Ryan_ have similarity = 77.62%\n", "Files EOuan and Yann_ have similarity = 77.84%\n", "Files Camil and Ryan_ have similarity = 78.16%\n", "Files Ameli and Yann_ have similarity = 78.26%\n", "Files Ameli and Dimit have similarity = 78.45%\n", "Files Adéla and Ameli have similarity = 78.72%\n", "Files Bouch and EOuan have similarity = 78.80%\n", "Files EOuan and Julie have similarity = 79.47%\n", "Files Ameli and Lea_A have similarity = 80.79%\n", "Files Ameli and Ryan_ have similarity = 80.87%\n", "Files Jean_ and Mathu have similarity = 97.16%\n", "Files Ewen_ and Ivann have similarity = 98.14%\n", "Files Louis and Tom_C have similarity = 98.99%\n", "Files Tom_C and Yvan_ have similarity = 98.99%\n", "Files Axel_ and Vince have similarity = 99.08%\n", "Files Cleme and Enzo_ have similarity = 100.00%\n", "Files Arian and Yoan_ have similarity = 100.00%\n", "Files Louis and Yvan_ have similarity = 100.00%\n", "Files Remi_ and Theo_ have similarity = 100.00%\n" ] } ], "source": [ "for n1, n2, score in filter_plagiarism(sort_plagiarism(s_vectors)):\n", " name1 = n1.replace(f'.{extension}', '')[:5]\n", " name2 = n2.replace(f'.{extension}', '')[:5]\n", " print(f\"Files {name1} and {name2} have similarity = {score:.2%}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It's already pretty good!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }