{ "cells": [ { "cell_type": "markdown", "id": "8ae66b5c", "metadata": {}, "source": [ "# Metadata extraction and preparation" ] }, { "cell_type": "code", "execution_count": 11, "id": "d1b56468", "metadata": { "scrolled": true }, "outputs": [], "source": [ "import os\n", "import json" ] }, { "cell_type": "markdown", "id": "714c7387", "metadata": {}, "source": [ "## Running SOMEF\n", "\n", "[SOMEF](https://github.com/KnowledgeCaptureAndDiscovery/somef) is a tool that automatically extracts relevant information from README files of GitHub/GitLab repositories and saves it as JSON files. We run this tool to extract the metadata from all repositories in the [oeg-upm](https://github.com/oeg-upm/) organisation." ] }, { "cell_type": "code", "execution_count": 9, "id": "a2fe6fd5", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: somef in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (0.9.3)\n", "Requirement already satisfied: imbalanced-learn>=0.8.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (0.8.1)\n", "Requirement already satisfied: bs4==0.0.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (0.0.1)\n", "Requirement already satisfied: matplotlib==3.5.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (3.5.0)\n", "Requirement already satisfied: contractions>=0.1.66 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (0.1.73)\n", "Requirement already satisfied: nltk==3.6.6 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (3.6.6)\n", "Requirement already satisfied: requests>=2.22.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (2.27.1)\n", "Requirement already satisfied: pandas==1.3.4 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (1.3.4)\n", "Requirement already satisfied: pytest in 
/Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (7.4.0)\n", "Requirement already satisfied: textblob==0.17.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (0.17.1)\n", "Requirement already satisfied: scipy>=1.7.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (1.9.3)\n", "Requirement already satisfied: markdown==3.3.6 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (3.3.6)\n", "Requirement already satisfied: morph-kgc>=2.3.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (2.3.1)\n", "Requirement already satisfied: scikit-learn==1.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (1.0)\n", "Requirement already satisfied: validators==0.18.2 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (0.18.2)\n", "Requirement already satisfied: Click==7.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (7.0)\n", "Requirement already satisfied: click-option-group==0.5.3 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (0.5.3)\n", "Requirement already satisfied: rdflib-jsonld==0.6.2 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (0.6.2)\n", "Requirement already satisfied: rdflib>=6.0.2 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (6.2.0)\n", "Requirement already satisfied: numpy==1.22.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (1.22.0)\n", "Requirement already satisfied: inflect>=5.4.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (7.0.0)\n", "Requirement already satisfied: xgboost==1.5.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (1.5.0)\n", "Requirement already satisfied: chardet==5.0.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from somef) (5.0.0)\n", "Requirement already satisfied: beautifulsoup4 in 
/Users/aiglesias/miniconda3/lib/python3.9/site-packages (from bs4==0.0.1->somef) (4.12.2)\n", "Requirement already satisfied: importlib-metadata>=4.4 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from markdown==3.3.6->somef) (5.0.0)\n", "Requirement already satisfied: setuptools-scm>=4 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from matplotlib==3.5.0->somef) (7.1.0)\n", "Requirement already satisfied: fonttools>=4.22.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from matplotlib==3.5.0->somef) (4.38.0)\n", "Requirement already satisfied: pyparsing>=2.2.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from matplotlib==3.5.0->somef) (3.0.9)\n", "Requirement already satisfied: python-dateutil>=2.7 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from matplotlib==3.5.0->somef) (2.8.2)\n", "Requirement already satisfied: packaging>=20.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from matplotlib==3.5.0->somef) (21.3)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from matplotlib==3.5.0->somef) (1.4.4)\n", "Requirement already satisfied: pillow>=6.2.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from matplotlib==3.5.0->somef) (9.3.0)\n", "Requirement already satisfied: cycler>=0.10 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from matplotlib==3.5.0->somef) (0.11.0)\n", "Requirement already satisfied: regex>=2021.8.3 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from nltk==3.6.6->somef) (2023.6.3)\n", "Requirement already satisfied: joblib in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from nltk==3.6.6->somef) (1.2.0)\n", "Requirement already satisfied: tqdm in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from nltk==3.6.6->somef) (4.63.0)\n", "Requirement already satisfied: pytz>=2017.3 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from 
pandas==1.3.4->somef) (2022.5)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from scikit-learn==1.0->somef) (3.1.0)\n", "Requirement already satisfied: six>=1.4.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from validators==0.18.2->somef) (1.16.0)\n", "Requirement already satisfied: decorator>=3.4.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from validators==0.18.2->somef) (5.1.1)\n", "Requirement already satisfied: textsearch>=0.0.21 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from contractions>=0.1.66->somef) (0.0.24)\n", "Requirement already satisfied: zipp>=0.5 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from importlib-metadata>=4.4->markdown==3.3.6->somef) (3.9.0)\n", "Requirement already satisfied: typing-extensions in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from inflect>=5.4.0->somef) (4.7.1)\n", "Requirement already satisfied: pydantic>=1.9.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from inflect>=5.4.0->somef) (2.0)\n", "Requirement already satisfied: pyoxigraph>=0.3.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from morph-kgc>=2.3.1->somef) (0.3.6)\n", "Requirement already satisfied: sql-metadata>=2.3.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from morph-kgc>=2.3.1->somef) (2.6.0)\n", "Requirement already satisfied: falcon>=3.0.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from morph-kgc>=2.3.1->somef) (3.1.0)\n", "Requirement already satisfied: jsonpath-python>=1.0.5 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from morph-kgc>=2.3.1->somef) (1.0.6)\n", "Requirement already satisfied: elementpath>=2.4.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from morph-kgc>=2.3.1->somef) (3.0.2)\n", "Requirement already satisfied: duckdb>=0.5.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from 
morph-kgc>=2.3.1->somef) (0.6.1)\n", "Requirement already satisfied: SQLAlchemy>=1.4.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from morph-kgc>=2.3.1->somef) (1.4.42)\n", "Requirement already satisfied: annotated-types>=0.4.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from pydantic>=1.9.1->inflect>=5.4.0->somef) (0.5.0)\n", "Requirement already satisfied: pydantic-core==2.0.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from pydantic>=1.9.1->inflect>=5.4.0->somef) (2.0.1)\n", "Requirement already satisfied: setuptools in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from rdflib>=6.0.2->somef) (61.2.0)\n", "Requirement already satisfied: isodate in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from rdflib>=6.0.2->somef) (0.6.1)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from requests>=2.22.0->somef) (1.26.8)\n", "Requirement already satisfied: certifi>=2017.4.17 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from requests>=2.22.0->somef) (2021.10.8)\n", "Requirement already satisfied: idna<4,>=2.5 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from requests>=2.22.0->somef) (3.3)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from requests>=2.22.0->somef) (2.0.4)\n", "Requirement already satisfied: tomli>=1.0.0 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from setuptools-scm>=4->matplotlib==3.5.0->somef) (2.0.1)\n", "Requirement already satisfied: sqlparse<0.5.0,>=0.4.1 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from sql-metadata>=2.3.0->morph-kgc>=2.3.1->somef) (0.4.3)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: greenlet!=0.4.17 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from SQLAlchemy>=1.4.0->morph-kgc>=2.3.1->somef) 
(1.1.3.post0)\n", "Requirement already satisfied: anyascii in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from textsearch>=0.0.21->contractions>=0.1.66->somef) (0.3.2)\n", "Requirement already satisfied: pyahocorasick in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from textsearch>=0.0.21->contractions>=0.1.66->somef) (2.0.0)\n", "Requirement already satisfied: soupsieve>1.2 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from beautifulsoup4->bs4==0.0.1->somef) (2.4.1)\n", "Requirement already satisfied: exceptiongroup>=1.0.0rc8 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from pytest->somef) (1.1.2)\n", "Requirement already satisfied: pluggy<2.0,>=0.12 in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from pytest->somef) (1.2.0)\n", "Requirement already satisfied: iniconfig in /Users/aiglesias/miniconda3/lib/python3.9/site-packages (from pytest->somef) (2.0.0)\n", "/Users/aiglesias/miniconda3/lib/python3.9/runpy.py:127: RuntimeWarning: 'nltk.downloader' found in sys.modules after import of package 'nltk', but prior to execution of 'nltk.downloader'; this may result in unpredictable behaviour\n", " warn(RuntimeWarning(msg))\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] /Users/aiglesias/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "/Users/aiglesias/miniconda3/lib/python3.9/runpy.py:127: RuntimeWarning: 'nltk.downloader' found in sys.modules after import of package 'nltk', but prior to execution of 'nltk.downloader'; this may result in unpredictable behaviour\n", " warn(RuntimeWarning(msg))\n", "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] /Users/aiglesias/nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n", "/Users/aiglesias/miniconda3/lib/python3.9/runpy.py:127: RuntimeWarning: 'nltk.downloader' found in sys.modules after import of package 'nltk', but prior to execution of 'nltk.downloader'; this may result in unpredictable 
behaviour\n", " warn(RuntimeWarning(msg))\n", "[nltk_data] Downloading package punkt to /Users/aiglesias/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "/Users/aiglesias/miniconda3/lib/python3.9/runpy.py:127: RuntimeWarning: 'nltk.downloader' found in sys.modules after import of package 'nltk', but prior to execution of 'nltk.downloader'; this may result in unpredictable behaviour\n", " warn(RuntimeWarning(msg))\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /Users/aiglesias/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/requests/__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.8) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!\n", " warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n", "SOftware Metadata Extraction Framework (SOMEF) Command Line Interface\n", "Configuring SOMEF automatically. To assign credentials edit the configuration file or run the interactive mode\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] /Users/aiglesias/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] /Users/aiglesias/nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n", "04-Jul-23 17:34:34-INFO-Configuration file saved at /Users/aiglesias/.somef\n", "\u001b[32mSuccess\u001b[0m\n" ] } ], "source": [ "# %pip (rather than !pip3) installs into the environment of the running kernel.\n", "# The version is pinned to the one this notebook was last run with.\n", "%pip install somef==0.9.3\n", "!python -m nltk.downloader wordnet\n", "!python -m nltk.downloader omw-1.4\n", "!python -m nltk.downloader punkt\n", "!python -m nltk.downloader stopwords\n", "!somef configure -a" ] }, { "cell_type": "markdown", "id": "02eed579", "metadata": {}, "source": [ "Once installed and configured, we run the tool to extract the metadata of Mapeathor from [https://github.com/oeg-upm/mapeathor](https://github.com/oeg-upm/mapeathor). 
To extract every repository in an organisation, the same process needs to be repeated for every repository URL. We show here the extraction of one to exemplify the process." ] }, { "cell_type": "code", "execution_count": 10, "id": "2df0a34d", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/requests/__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.8) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!\n", " warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n", "SOftware Metadata Extraction Framework (SOMEF) Command Line Interface\n", "04-Jul-23 17:34:48-INFO-Loading Repository https://github.com/oeg-upm/mapeathor Information....\n", "04-Jul-23 17:34:48-DEBUG-Starting new HTTPS connection (1): api.github.com:443\n", "04-Jul-23 17:34:48-DEBUG-https://api.github.com:443 \"GET /repos/oeg-upm/mapeathor HTTP/1.1\" 200 1465\n", "04-Jul-23 17:34:48-INFO-Remaining GitHub API requests: 59 ### Next rate limit reset at: 2023-07-04 18:34:48\n", "04-Jul-23 17:34:48-DEBUG-Starting new HTTPS connection (1): api.github.com:443\n", "04-Jul-23 17:34:48-DEBUG-https://api.github.com:443 \"GET /repos/oeg-upm/mapeathor/languages HTTP/1.1\" 200 53\n", "04-Jul-23 17:34:48-INFO-Remaining GitHub API requests: 58 ### Next rate limit reset at: 2023-07-04 18:34:48\n", "04-Jul-23 17:34:48-DEBUG-Starting new HTTPS connection (1): api.github.com:443\n", "04-Jul-23 17:34:49-DEBUG-https://api.github.com:443 \"GET /repos/oeg-upm/mapeathor/releases HTTP/1.1\" 200 1680\n", "04-Jul-23 17:34:49-INFO-Remaining GitHub API requests: 57 ### Next rate limit reset at: 2023-07-04 18:34:47\n", "04-Jul-23 17:34:49-WARNING-Ignoring empty value in release for description\n", "04-Jul-23 17:34:49-INFO-Repository information successfully loaded.\n", "\n", "04-Jul-23 17:34:49-INFO-Downloading 
https://github.com/oeg-upm/mapeathor/archive/master.zip\n", "04-Jul-23 17:34:49-DEBUG-Starting new HTTPS connection (1): github.com:443\n", "04-Jul-23 17:34:49-DEBUG-https://github.com:443 \"GET /oeg-upm/mapeathor/archive/master.zip HTTP/1.1\" 302 0\n", "04-Jul-23 17:34:49-DEBUG-Starting new HTTPS connection (1): codeload.github.com:443\n", "04-Jul-23 17:34:49-DEBUG-https://codeload.github.com:443 \"GET /oeg-upm/mapeathor/zip/refs/heads/master HTTP/1.1\" 200 None\n", "04-Jul-23 17:34:50-INFO-Extracting information using headers\n", "04-Jul-23 17:34:50-INFO-Labeling headers.\n", "04-Jul-23 17:34:53-INFO-Header information extracted.\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator TfidfTransformer from version 1.0.2 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator TfidfVectorizer from version 1.0.2 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator LinearSVC from version 1.0.2 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. 
For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator _SigmoidCalibration from version 1.0.2 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator CalibratedClassifierCV from version 1.0.2 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.2 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator TfidfTransformer from version 1.0.1 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. 
For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator TfidfVectorizer from version 1.0.1 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator LinearSVC from version 1.0.1 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator _SigmoidCalibration from version 1.0.1 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "/Users/aiglesias/miniconda3/lib/python3.9/site-packages/sklearn/base.py:324: UserWarning: Trying to unpickle estimator CalibratedClassifierCV from version 1.0.1 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", "https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations\n", " warnings.warn(\n", "04-Jul-23 17:35:00-INFO-Splitting text into valid excerpts for classification\n", "04-Jul-23 17:35:00-INFO-Extraction of bibtex citation from readme completed. 
\n", "\n", "04-Jul-23 17:35:00-INFO-Text Successfully split.\n", "04-Jul-23 17:35:00-INFO-Classifying excerpts for the category description\n", "04-Jul-23 17:35:00-INFO-Classifying excerpts for the category installation\n", "04-Jul-23 17:35:00-INFO-Classifying excerpts for the category invocation\n", "04-Jul-23 17:35:00-INFO-Checking thresholds for classified excerpts.\n", "04-Jul-23 17:35:00-INFO-All excerpts below the threshold have been removed.\n", "04-Jul-23 17:35:00-DEBUG-Starting new HTTPS connection (1): github.com:443\n", "04-Jul-23 17:35:00-DEBUG-https://github.com:443 \"GET /oeg-upm/mapeathor/wiki HTTP/1.1\" 200 None\n", "04-Jul-23 17:35:00-DEBUG-Starting new HTTPS connection (1): pypi.python.org:443\n", "04-Jul-23 17:35:00-DEBUG-https://pypi.python.org:443 \"GET /pypi/mapeathor HTTP/1.1\" 301 122\n", "04-Jul-23 17:35:00-DEBUG-Starting new HTTPS connection (1): pypi.org:443\n", "04-Jul-23 17:35:00-DEBUG-https://pypi.org:443 \"GET /pypi/mapeathor HTTP/1.1\" 301 213\n", "04-Jul-23 17:35:01-DEBUG-https://pypi.org:443 \"GET /pypi/mapeathor/ HTTP/1.1\" 301 216\n", "04-Jul-23 17:35:01-DEBUG-https://pypi.org:443 \"GET /project/mapeathor/ HTTP/1.1\" 200 13822\n", "04-Jul-23 17:35:01-INFO-Completed extracting regular expressions\n", "Saving json data to ../data/out.json\n", "\u001b[32mSuccess\u001b[0m\n" ] } ], "source": [ "# Describe one repository (-r) and write the extracted metadata to the given JSON file (-o).\n", "# NOTE(review): the stored output of this cell reports saving to ../data/out.json, which differs\n", "# from the -o path below; the stored output looks stale -- confirm by re-running the cell.\n", "!somef describe -r https://github.com/oeg-upm/mapeathor -o ../data/somef-data/mapeathor.json" ] }, { "cell_type": "markdown", "id": "c1cfc820", "metadata": {}, "source": [ "## Input JSON preparation\n", "\n", "Once all desired repositories are processed, we take the list of json files, one corresponding to one repository, remove from the list the repositories that contain ontologies or websites, and merge the remaining ones into one JSON file." 
] }, { "cell_type": "markdown", "id": "e98e838f", "metadata": {}, "source": [ "### Clear input JSONs\n", "Removal of names of GitHub repositories containing webpages or ontologies from the list of repositories within the [oeg-upm](https://github.com/oeg-upm/) organisation." ] }, { "cell_type": "code", "execution_count": 2, "id": "897852ac", "metadata": {}, "outputs": [], "source": [ "# Directory holding the individual SOMEF output files (one JSON file per repository).\n", "path = '../data/somef-data/single-json/'\n", "\n", "# os.listdir() returns entries in arbitrary order and also lists non-JSON entries\n", "# (e.g. '.DS_Store'), which would break json.load() later on. Keep only the\n", "# '.json' files and sort them so that every run yields the same list.\n", "json_file_names = sorted(\n", "    name for name in os.listdir(path) if name.endswith('.json')\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "id": "a5d08ab7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['oeg-upm_weather1_2023-06-28.json',\n", " 'oeg-upm_bimerr-core_2023-06-28.json',\n", " 'oeg-upm_MIRROR_2023-06-28.json',\n", " 'oeg-upm_bimerr-metadata_2023-06-28.json',\n", " 'oeg-upm_pcake_2023-06-28.json',\n", " 'oeg-upm_cogito-coppola_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-userinterface_2023-06-28.json',\n", " 'oeg-upm_terminology-extractor-incibe_2023-06-28.json',\n", " 'oeg-upm_AttentionRankLib_2023-06-28.json',\n", " 'oeg-upm_ckanext-federgob_2023-06-28.json',\n", " 'oeg-upm_saref-ext_2023-06-28.json',\n", " 'oeg-upm_LabSensingArduino_2023-06-28.json',\n", " 'oeg-upm_esuk_2023-06-28.json',\n", " 'oeg-upm_bimerr-materials_2023-06-28.json',\n", " 'oeg-upm_hola-si-protocol_2023-06-28.json',\n", " 'oeg-upm_termlex_2023-06-28.json',\n", " 'oeg-upm_CODICE-extractor_2023-06-28.json',\n", " 'oeg-upm_ssspotter_2023-06-28.json',\n", " 'oeg-upm_OnToology-view-mock_2023-06-28.json',\n", " 'oeg-upm_Wikidata-class-diagram-generator_2023-06-28.json',\n", " 'oeg-upm_ontologia-ciberseguridad_2023-06-28.json',\n", " 'oeg-upm_tada-map-score_2023-06-28.json',\n", " 'oeg-upm_drugs4covid19-nlp_2023-06-28.json',\n", " 'oeg-upm_CJCyL_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-engine-mappings_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-engine-ws_2023-06-28.json',\n", " 'oeg-upm_cogito-kgg_2023-06-28.json',\n", " 'oeg-upm_tada-reduce-combine_2023-06-28.json',\n", "
'oeg-upm_bimerr-renovation-process_2023-06-28.json',\n", " 'oeg-upm_t5-spanish-news-summarization_2023-06-28.json',\n", " 'oeg-upm_FarolAppsWeb_2023-06-28.json',\n", " 'oeg-upm_drugs4covid19-vocab_2023-06-28.json',\n", " 'oeg-upm_JSONPath-to-SPARQL_2023-06-28.json',\n", " 'oeg-upm_S4WATR_2023-06-28.json',\n", " 'oeg-upm_epw2rdf-contents_2023-06-28.json',\n", " 'oeg-upm_personal-repo_2023-06-28.json',\n", " 'oeg-upm_declarative-functions_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-engine-datasets_2023-06-28.json',\n", " 'oeg-upm_widaug_2023-06-28.json',\n", " 'oeg-upm_oatapi_2023-06-28.json',\n", " 'oeg-upm_agora-py_2023-06-28.json',\n", " 'oeg-upm_BRATtoBIO_2023-06-28.json',\n", " 'oeg-upm_yatter_2023-06-28.json',\n", " 'oeg-upm_Massive-ROs-Creator_2023-06-28.json',\n", " 'oeg-upm_201612-clarityhackathon-upm_2023-06-28.json',\n", " 'oeg-upm_corpuser_2023-06-28.json',\n", " 'oeg-upm_TPDL2022_FAIROs_2023-06-28.json',\n", " 'oeg-upm_spread-sheet-space-apis_2023-06-28.json',\n", " 'oeg-upm_Widoco_2023-06-28.json',\n", " 'oeg-upm_doccano_formatter_2023-06-28.json',\n", " 'oeg-upm_cpv-classifier_2023-06-28.json',\n", " 'oeg-upm_morph-gft_2023-06-28.json',\n", " 'oeg-upm_github-action-morph-kgc_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-contents_2023-06-28.json',\n", " 'oeg-upm_outlinejs_2023-06-28.json',\n", " 'oeg-upm_Devos_2023-06-28.json',\n", " 'oeg-upm_gtfs-csv2rdf_2023-06-28.json',\n", " 'oeg-upm_software_catalog_2023-06-28.json',\n", " 'oeg-upm_easysparql_2023-06-28.json',\n", " 'oeg-upm_OEG-tutorial-template_2023-06-28.json',\n", " 'oeg-upm_tada-entity_2023-06-28.json',\n", " 'oeg-upm_Ethereum-Smart-Contract-Downloader_2023-06-28.json',\n", " 'oeg-upm_BO2DM_2023-06-28.json',\n", " 'oeg-upm_OWL-To-OAS-Specification_2023-06-28.json',\n", " 'oeg-upm_cogito-kgg-module_2023-06-28.json',\n", " 'oeg-upm_morph-skyline_2023-06-28.json',\n", " 'oeg-upm_mgds-ids_2023-06-28.json',\n", " 'oeg-upm_bimerr-annotation-objects_2023-06-28.json',\n", " 
'oeg-upm_Conceptual-Mapping_2023-06-28.json',\n", " 'oeg-upm_tada-api_2023-06-28.json',\n", " 'oeg-upm_oeg-upm-github-io_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-engine-datasets-ws_2023-06-28.json',\n", " 'oeg-upm_LDP4RO_2023-06-28.json',\n", " 'oeg-upm_SMART-Protocols_2023-06-28.json',\n", " 'oeg-upm_yarrrml-validation_2023-06-28.json',\n", " 'oeg-upm_ner4soft_2023-06-28.json',\n", " 'oeg-upm_morph-kgc-1_2023-06-28.json',\n", " 'oeg-upm_TPool_2023-06-28.json',\n", " 'oeg-upm_wot-hive_2023-06-28.json',\n", " 'oeg-upm_Cloud_Helio_Adapter_2023-06-28.json',\n", " 'oeg-upm_beto-covid-sentiment-analysis_2023-06-28.json',\n", " 'oeg-upm_DIoT_2023-06-28.json',\n", " 'oeg-upm_pycpulimit_2023-06-28.json',\n", " 'oeg-upm_BIMERR-KGG_2023-06-28.json',\n", " 'oeg-upm_morph-streams_2023-06-28.json',\n", " 'oeg-upm_terminology-extractor_2023-06-28.json',\n", " 'oeg-upm_w3id-org_2023-06-28.json',\n", " 'oeg-upm_Instituto-Estudios-Fiscales-ontologias_2023-06-28.json',\n", " 'oeg-upm_srbench_2023-06-28.json',\n", " 'oeg-upm_ssn-resource-center_2023-06-28.json',\n", " 'oeg-upm_easytv-onto_2023-06-28.json',\n", " 'oeg-upm_sprint_2023-06-28.json',\n", " 'oeg-upm_OTALEX-C_2023-06-28.json',\n", " 'oeg-upm_Morph-OME_2023-06-28.json',\n", " 'oeg-upm_things-manager-platform_2023-06-28.json',\n", " 'oeg-upm_ya2ro_2023-06-28.json',\n", " 'oeg-upm_SmartDeveloperHub-github-io_2023-06-28.json',\n", " 'oeg-upm_oeg-software-graph_2023-06-28.json',\n", " 'oeg-upm_AI4EU_raidologist_2023-06-28.json',\n", " 'oeg-upm_helio-materialiser_2023-06-28.json',\n", " 'oeg-upm_DeltaCimApp_2023-06-28.json',\n", " 'oeg-upm_gwt-blocks_2023-06-28.json',\n", " 'oeg-upm_valkyr-ie-py_2023-06-28.json',\n", " 'oeg-upm_rdf-star-generation_2023-06-28.json',\n", " 'oeg-upm_linked-gtfs_2023-06-28.json',\n", " 'oeg-upm_drugs4covid19-kg_2023-06-28.json',\n", " 'oeg-upm_mapeathor_2023-06-28.json',\n", " 'oeg-upm_loom-ld_2023-06-28.json',\n", " 'oeg-upm_docker-freeling4_2023-06-28.json',\n", " 
'oeg-upm_fuzzy-c-means_2023-06-28.json',\n", " 'oeg-upm_licensius_2023-06-28.json',\n", " 'oeg-upm_ttla_2023-06-28.json',\n", " 'oeg-upm_easytv-resources_2023-06-28.json',\n", " 'oeg-upm_confs-info_2023-06-28.json',\n", " 'oeg-upm_PPool_2023-06-28.json',\n", " 'oeg-upm_btn100_2023-06-28.json',\n", " 'oeg-upm_Themis_2023-06-28.json',\n", " 'oeg-upm_vocabUpdates_2023-06-28.json',\n", " 'oeg-upm_vocab-linkeddata-es_2023-06-28.json',\n", " 'oeg-upm_tada-type-graph_2023-06-28.json',\n", " 'oeg-upm_agora-gw_2023-06-28.json',\n", " 'oeg-upm_FAIR-Research-Object-API_2023-06-28.json',\n", " 'oeg-upm_drugs4covid19_2023-06-28.json',\n", " 'oeg-upm_morph-csv_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-contents-test_2023-06-28.json',\n", " 'oeg-upm_seed-sensors-oeg_2023-06-28.json',\n", " 'oeg-upm_transforming-term-extraction-lib_2023-06-28.json',\n", " 'oeg-upm_vkg-tutorial-eswc2019_2023-06-28.json',\n", " 'oeg-upm_vicinity-ontologies_2023-06-28.json',\n", " 'oeg-upm_dataeuropa-analysis_2023-06-28.json',\n", " 'oeg-upm_cogito_data_repository_2023-06-28.json',\n", " 'oeg-upm_epnoi_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-engine-executions_2023-06-28.json',\n", " 'oeg-upm_bimerr-material-properties_2023-06-28.json',\n", " 'oeg-upm_bimerr-building_2023-06-28.json',\n", " 'oeg-upm_bimerr-weather_2023-06-28.json',\n", " 'oeg-upm_ar2dtool-oegfork_2023-06-28.json',\n", " 'oeg-upm_IEBrain_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-engine-mappings-ws_2023-06-28.json',\n", " 'oeg-upm_agora-wot_2023-06-28.json',\n", " 'oeg-upm_morph-geo_2023-06-28.json',\n", " 'oeg-upm_github-action-sparql_2023-06-28.json',\n", " 'oeg-upm_bimerr-obxml_2023-06-28.json',\n", " 'oeg-upm_helio_2023-06-28.json',\n", " 'oeg-upm_geo-agreement_2023-06-28.json',\n", " 'oeg-upm_docker-geokettle-x3geo_2023-06-28.json',\n", " 'oeg-upm_Sancus_2023-06-28.json',\n", " 'oeg-upm_devops-infra_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-engine_2023-06-28.json',\n", " 
'oeg-upm_bimerr-information-objects_2023-06-28.json',\n", " 'oeg-upm_tada-qq_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-engine-executions-ws_2023-06-28.json',\n", " 'oeg-upm_fair_ontologies_2023-06-28.json',\n", " 'oeg-upm_geo-linkeddata-es-TripleGeoKettle_2023-06-28.json',\n", " 'oeg-upm_city4age_2023-06-28.json',\n", " 'oeg-upm_Chowlk_2023-06-28.json',\n", " 'oeg-upm_bimerr-senML_2023-06-28.json',\n", " 'oeg-upm_mappingpedia-engine-commons_2023-06-28.json',\n", " 'oeg-upm_LOT-resources_2023-06-28.json',\n", " 'oeg-upm_lubm4obda_2023-06-28.json',\n", " 'oeg-upm_snap-docs_2023-06-28.json',\n", " 'oeg-upm_OnToologyDnD_2023-06-28.json',\n", " 'oeg-upm_Jarsomatic_2023-06-28.json',\n", " 'oeg-upm_map4rdf_2023-06-28.json',\n", " 'oeg-upm_gtfs-bench_2023-06-28.json',\n", " 'oeg-upm_LLD-Search_2023-06-28.json',\n", " 'oeg-upm_FAIR-Research-Object_2023-06-28.json',\n", " 'oeg-upm_WordPress-RDFa_2023-06-28.json',\n", " 'oeg-upm_morph-rdb_2023-06-28.json',\n", " 'oeg-upm_IVHackathonOpenDataCaceres-UPM_2023-06-28.json',\n", " 'oeg-upm_ipfs-service_2023-06-28.json',\n", " 'oeg-upm_owl2yarrrml_2023-06-28.json',\n", " 'oeg-upm_coppola_2023-06-28.json',\n", " 'oeg-upm_kehio_2023-06-28.json',\n", " 'oeg-upm_helio-publisher_2023-06-28.json',\n", " 'oeg-upm_DBpedia-downloader_2023-06-28.json',\n", " 'oeg-upm_eWoT_2023-06-28.json',\n", " 'oeg-upm_cogito-sparql_2023-06-28.json',\n", " 'oeg-upm_chowlk_spec_2023-06-28.json',\n", " 'oeg-upm_fcm-cpp_2023-06-28.json',\n", " 'oeg-upm_lynx-py_2023-06-28.json',\n", " 'oeg-upm_cogito_thing_manager_module_2023-06-28.json',\n", " 'oeg-upm_tada-hdt-numeric_2023-06-28.json',\n", " 'oeg-upm_bimerr-health-security_2023-06-28.json',\n", " 'oeg-upm_easytv-semantic-annotator_2023-06-28.json',\n", " 'oeg-upm_TINTO_2023-06-28.json',\n", " 'oeg-upm_cogito_final_thing_manager_2023-06-28.json',\n", " 'oeg-upm_kgc-eval_2023-06-28.json',\n", " 'oeg-upm_oops-plugin_2023-06-28.json',\n", " 'oeg-upm_valkyr-ie-gate_2023-06-28.json',\n", " 
'oeg-upm_SendEmailWebApp_2023-06-28.json',\n", " 'oeg-upm_mobility_2023-06-28.json',\n", " 'oeg-upm_SancusDemo_2023-06-28.json',\n", " 'oeg-upm_virtuoso-triple-store_2023-06-28.json',\n", " 'oeg-upm_bimerr-epw_2023-06-28.json',\n", " 'oeg-upm_kgc-tutorial-iswc2020_2023-06-28.json',\n", " 'oeg-upm_helio-plugins_2023-06-28.json',\n", " 'oeg-upm_astrea_2023-06-28.json',\n", " 'oeg-upm_loupe-api_2023-06-28.json',\n", " 'oeg-upm_epw2rdf_2023-06-28.json',\n", " 'oeg-upm_soca_2023-06-28.json',\n", " 'oeg-upm_github-action-pretty-yarrrml2rml_2023-06-28.json',\n", " 'oeg-upm_tada-gam_2023-06-28.json',\n", " 'oeg-upm_wot-jtd_2023-06-28.json',\n", " 'oeg-upm_drugs4covid19-cs_2023-06-28.json',\n", " 'oeg-upm_wikidata-label-extractor_2023-06-28.json',\n", " 'oeg-upm_mobileage-platform-example_2023-06-28.json',\n", " 'oeg-upm_FAIROS_DAMALOS_2023_2023-06-28.json',\n", " 'oeg-upm_vocabTest_2023-06-28.json',\n", " 'oeg-upm_GEnI_2023-06-28.json',\n", " 'oeg-upm_cogito_wrapper_module_2023-06-28.json',\n", " 'oeg-upm_helio-framework_2023-06-28.json',\n", " 'oeg-upm_subject_column_election_2023-06-28.json',\n", " 'oeg-upm_tada-hdt-entity-experiment_2023-06-28.json',\n", " 'oeg-upm_rmlc-statistic_2023-06-28.json',\n", " 'oeg-upm_ROCrate_enrichment_service_2023-06-28.json',\n", " 'oeg-upm_bimerr-renovation-measures_2023-06-28.json',\n", " 'oeg-upm_r4r_2023-06-28.json',\n", " 'oeg-upm_sdg-text-retriever_2023-06-28.json',\n", " 'oeg-upm_bimerr-occupant-behavior_2023-06-28.json',\n", " 'oeg-upm_WAugNER_2023-06-28.json',\n", " 'oeg-upm_hcommonk-anonymizer_2023-06-28.json',\n", " 'oeg-upm_pytada-hdt-entity_2023-06-28.json',\n", " 'oeg-upm_tada-hdt-entity_2023-06-28.json',\n", " 'oeg-upm_easytv-annotator_2023-06-28.json',\n", " 'oeg-upm_transmodel-cq_2023-06-28.json',\n", " 'oeg-upm_ShapesToWidocoHTML_2023-06-28.json',\n", " 'oeg-upm_TINTOlib-Documentation_2023-06-28.json',\n", " 'oeg-upm_bimerr-kpi_2023-06-28.json',\n", " 'oeg-upm_ainn-request_2023-06-28.json',\n", " 
# --- Cell: select the SOMEF files to keep ---------------------------------
# `json_file_names` is expected to be defined by an earlier cell (not visible
# in this part of the notebook).
# Any file whose name contains "web" or "ontology" is excluded from the merge.
files_to_delete = [
    file for file in json_file_names
    if "web" in file or "ontology" in file
]

# A set makes the membership test O(1); iterating over the original list
# keeps the original ordering of the remaining files.
_excluded_names = set(files_to_delete)
clean_json = [file for file in json_file_names if file not in _excluded_names]
clean_json

# --- Cell: merge individual JSON files into one ---------------------------
# The selected JSON files are merged into one file to facilitate the
# subsequent construction of the knowledge graph.
# `path` (directory holding the per-repository SOMEF files) is expected to be
# defined by an earlier cell (not visible here).

# NOTE(review): machine-specific absolute path kept as-is to preserve the
# output location; consider deriving it from a configurable data directory.
MERGED_JSON_PATH = '/Users/aiglesias/GitHub/oeg-software-graph/data/somef.json'

merged_json = []
for file in clean_json:
    # os.path.join works whether or not `path` ends with a separator.
    filename = os.path.join(path, file)
    with open(filename, 'r', encoding='utf-8') as infile:
        merged_json.append(json.load(infile))

# Write the merged list once, after every input has been loaded. Previously
# the output file was rewritten on each loop iteration and was never created
# when `clean_json` was empty; now an empty selection yields an empty list.
with open(MERGED_JSON_PATH, 'w', encoding='utf-8') as out_json:
    json.dump(merged_json, out_json)