{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from textblob import Word" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 GPRPy\n", "1 Simplemost installation\n", "2 Running the software\n", "3 Running automatically generated scripts\n", "4 In case of trouble\n", "5 Uninstalling GPRPy\n", "6 News\n", "7 Scale-recurrent Network for Deep Image Deblur...\n", "8 Our results on real data\n", "9 Results on the testing dataset\n", "10 More cases on real photos from previous papers:\n", "11 Prerequisites\n", "12 Installation\n", "13 Testing\n", "14 Evaluation\n", "15 Training\n", "16 Models\n", "17 How to choose\n", "18 Reference\n", "19 Contact\n", "20 Reference\n", "21 connect to the API\n", "22 download single scene by known product id\n", "23 search by polygon, time, and Hub query keywords\n", "24 download all results from the search\n", "25 GeoJSON FeatureCollection containing footprin...\n", "26 GeoPandas GeoDataFrame with the metadata of t...\n", "27 Get basic information about the product: its ...\n", "28 its download url\n", "29 Get the product's full metadata available on ...\n", " ... \n", "831 Local config\n", "832 Example of a json file\n", "833 Example of a js module\n", "834 Custom renderers\n", "835 Known plugins\n", "836 pyGeoPressure -->\n", "837 Features\n", "838 Getting Started\n", "839 Installation\n", "840 Example\n", "841 Pore Pressure Prediction using well log data\n", "842 Documentation\n", "843 Contribute\n", "844 Report Bugs\n", "845 Suggest Enhancements\n", "846 Submit Pull Requests\n", "847 Support\n", "848 License\n", "849 News!\n", "850 Contents\n", "851 Introduction\n", "852 Requirements\n", "853 Quick Start\n", "854 Training\n", "855 Evaluation \n", "856 Prediction\n", "857 Baseline\n", "858 Contributors\n", "859 Citation\n", "860 License\n", "Name: Header, Length: 861, dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "headers = pd.read_csv(\"header.csv\")\n", "headers['Header']" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "citation = [Word(\"citation\").synsets[2], Word(\"reference\").synsets[1], Word(\"cite\").synsets[3]]\n", "run = [Word(\"run\").synsets[9],Word(\"run\").synsets[34],Word(\"execute\").synsets[4]]\n", "install = [Word(\"installation\").synsets[0],Word(\"install\").synsets[0],Word(\"setup\").synsets[1],Word(\"prepare\").synsets[0],Word(\"preparation\").synsets[0],Word(\"manual\").synsets[0],Word(\"guide\").synsets[2],Word(\"guide\").synsets[9]]\n", "download = [Word(\"download\").synsets[0]]\n", "requirement = [Word(\"requirement\").synsets[2],Word(\"prerequisite\").synsets[0],Word(\"prerequisite\").synsets[1],Word(\"dependency\").synsets[0],Word(\"dependent\").synsets[0]]\n", "contact = [Word(\"contact\").synsets[9]]\n", "description = [Word(\"description\").synsets[0],Word(\"description\").synsets[1],Word(\"introduction\").synsets[3],Word(\"introduction\").synsets[6],Word(\"basics\").synsets[0],Word(\"initiation\").synsets[1],Word(\"start\").synsets[0],Word(\"start\").synsets[4],Word(\"started\").synsets[0],Word(\"started\").synsets[1],Word(\"started\").synsets[7],Word(\"started\").synsets[8],Word(\"overview\").synsets[0],Word(\"summary\").synsets[0],Word(\"summary\").synsets[2]]\n", "contributor = [Word(\"contributor\").synsets[0]]\n", "documentation = [Word(\"documentation\").synsets[1]]\n", "license = [Word(\"license\").synsets[3],Word(\"license\").synsets[0]]\n", "usage = [Word(\"usage\").synsets[0],Word(\"example\").synsets[0],Word(\"example\").synsets[5],Word(\"implement\").synsets[1],Word(\"implementation\").synsets[1],Word(\"demo\").synsets[1],Word(\"tutorial\").synsets[0],Word(\"tutorial\").synsets[1]]\n", "update = [Word(\"updating\").synsets[0],Word(\"updating\").synsets[3]]\n", "issues = [Word(\"issues\").synsets[0],Word(\"errors\").synsets[5],Word(\"problems\").synsets[0],Word(\"problems\").synsets[2]]\n", "support = [Word(\"support\").synsets[7],Word(\"help\").synsets[0],Word(\"help\").synsets[9],Word(\"report\").synsets[0],Word(\"report\").synsets[6]]\n", "\n", "\n", "\n", "group = dict()\n", "group.update({\"citation\":citation})\n", "group.update({\"download\":download})\n", "group.update({\"run\":run})\n", "group.update({\"installation\":install})\n", "group.update({\"requirement\":requirement})\n", "group.update({\"contact\":contact})\n", "group.update({\"description\":description})\n", "group.update({\"contributor\":contributor})\n", "group.update({\"documentation\":documentation})\n", "group.update({\"license\":license})\n", "group.update({\"usage\":usage})\n", "group.update({\"update\":update})\n", "group.update({\"issues\":issues})\n", "group.update({\"support\":support})\n", "\n", "\n", "def find_sim(wordlist,wd): #returns the max probability between a word and subgroup\n", " simvalue = []\n", " for sense in wordlist:\n", " if(wd.path_similarity(sense)!=None):\n", " simvalue.append(wd.path_similarity(sense))\n", " if(len(simvalue)!=0):\n", " return max(simvalue)\n", " else:\n", " return 0\n", " \n", "\n", "def match_group(word_syn,group,threshold):\n", " currmax = 0\n", " maxgroup = \"\"\n", " simvalues = dict()\n", " for sense in word_syn: #for a given sense of a word\n", " similarities = []\n", " for key, value in group.items(): #value has all the similar words\n", " path_sim = find_sim(value,sense)\n", "# print(\"Similarity is:\",path_sim)\n", " if(path_sim>threshold): #then append to the list\n", " if(path_sim>currmax):\n", " maxgroup = key\n", " currmax = path_sim\n", "\n", " return maxgroup\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Header Group\n", "0 Simplemost installation installation\n", "1 Running the software run\n", "2 Running automatically generated scripts run\n", "3 In case of trouble issues\n", "4 Prerequisites requirement\n", "5 Installation installation\n", "6 Reference citation\n", "7 Contact contact\n", "8 Reference citation\n", "9 download single scene by known product id download\n", "10 download all results from the search download\n", "11 Get basic information about the product: its ... description\n", "12 its download url download\n", "13 Get the product's full metadata available on ... description\n", "14 Introduction description\n", "15 Prepare training data installation\n", "16 Begin to train description\n", "17 Quick start description\n", "18 Citation citation\n", "19 a record schema. We can get initial values f... description\n", "20 coordinate reference system as the source. Th... citation\n", "21 Get a point on the boundary of the record's description\n", "22 executes ``dst.flush(); dst.close()``. run\n", "23 Requirements requirement\n", "24 Installation installation\n", "25 Linux Setup with virtualenv installation\n", "26 Install TensorFlow installation\n", "27 Windows Setup with python 3 and Anaconda installation\n", "28 if you need to get chumpy description\n", "29 Demo usage\n", ".. ... ...\n", "344 License license\n", "345 Introduction description\n", "346 Documentation documentation\n", "347 Issues issues\n", "348 License license\n", "349 Installation installation\n", "350 Documentation documentation\n", "351 License license\n", "352 Get the 1st bending mode shape. Results are ... description\n", "353 to avoid getting the \"Factor is exactly singu... description\n", "354 to avoid getting the \"Factor is exactly singu... issues\n", "355 Install or Update installation\n", "356 Install or Update update\n", "357 Usage usage\n", "358 Example of a json file usage\n", "359 Example of a js module usage\n", "360 Getting Started description\n", "361 Getting Started description\n", "362 Installation installation\n", "363 Example usage\n", "364 Documentation documentation\n", "365 Report Bugs support\n", "366 Support support\n", "367 License license\n", "368 Introduction description\n", "369 Requirements requirement\n", "370 Quick Start description\n", "371 Contributors contributor\n", "372 Citation citation\n", "373 License license\n", "\n", "[374 rows x 2 columns]\n" ] } ], "source": [ "datadf = pd.DataFrame({'Header': [], 'Group': []})\n", "\n", "\n", "matchedgroups = []\n", "for h in headers[\"Header\"]:\n", " sentence = h.split(\" \")[1:]\n", " for s in sentence:\n", " synn = Word(s).synsets\n", " if(len(synn)>0):\n", " bestgroup = match_group(synn,group,0.6)\n", " if(bestgroup!=\"\"):\n", " datadf = datadf.append({'Header' : h, 'Group' : bestgroup}, ignore_index=True)\n", "print(datadf)\n", "datadf.to_csv('header_groups.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }