{ "cells": [ { "cell_type": "markdown", "metadata": { "run_control": { "frozen": false, "read_only": false }, "toc": "true" }, "source": [ "# Table of Contents\n", "

" ] },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "run_control": {
     "frozen": false,
     "read_only": false
    }
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import string\n",
    "from string import digits\n",
    "import textract\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "run_control": {
     "frozen": false,
     "read_only": false
    }
   },
   "outputs": [],
   "source": [
    "# Captures the text between a digit and the following 'Scopus'/'WoS' marker.\n",
    "pattern = re.compile(r'[0-9](.*?)\\s(Scopus|WoS)', re.DOTALL)\n",
    "\n",
    "\n",
    "def read_text(filename):\n",
    "    \"\"\"Extract the text of `filename` with textract and normalise it.\n",
    "\n",
    "    Periods are removed and every run of whitespace is collapsed to a\n",
    "    single space.\n",
    "    \"\"\"\n",
    "    text = textract.process(filename).replace('.', '')\n",
    "    return ' '.join(text.split())\n",
    "\n",
    "\n",
    "def format_name(name):\n",
    "    \"\"\"Return `name` lower-cased, keeping only non-digit alphanumerics.\n",
    "\n",
    "    ASCII digits, spaces and punctuation are dropped so that two titles\n",
    "    compare equal regardless of spacing or punctuation. Written without\n",
    "    `str.translate(None, ...)` / `filter` so it does not depend on\n",
    "    Python 2-only string behaviour.\n",
    "    \"\"\"\n",
    "    return ''.join(ch for ch in name.lower()\n",
    "                   if ch.isalnum() and ch not in digits)\n",
    "\n",
    "\n",
    "def find_matches(journal_names, formatted_names, file_matches, source_files):\n",
    "    \"\"\"Return (journal, source file) pairs for journals found in the files.\n",
    "\n",
    "    `formatted_names[i]` is the normalised form of `journal_names[i]`, and\n",
    "    `file_matches[j]` holds the normalised names extracted from\n",
    "    `source_files[j]`. Pairs are ordered by journal first, then by file.\n",
    "    \"\"\"\n",
    "    # Sets give constant-time membership tests instead of list scans.\n",
    "    lookups = [set(names) for names in file_matches]\n",
    "    found = []\n",
    "    for journal, formatted in zip(journal_names, formatted_names):\n",
    "        if not formatted:\n",
    "            # A blank list entry must never count as a match.\n",
    "            continue\n",
    "        for source_file, lookup in zip(source_files, lookups):\n",
    "            if formatted in lookup:\n",
    "                found.append((journal, source_file))\n",
    "    return found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true,
    "run_control": {
     "frozen": false,
     "read_only": false
    }
   },
   "outputs": [],
   "source": [
    "filenames = ['8919877_Journals-1.pdf',\n",
    "             '6988680_Journals-2.pdf',\n",
    "             '9047119_Journals-3.pdf',\n",
    "             '7690152_Journals-4.pdf',\n",
    "             '3554232_Journals-5.pdf']\n",
    "pdf_text = [read_text(filename) for filename in filenames]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true,
    "run_control": {
     "frozen": false,
     "read_only": false
    }
   },
   "outputs": [],
   "source": [
    "with open('Beall_list_oct2016.txt') as f:\n",
    "    beall_list_oct = [line.strip() for line in f]\n",
    "\n",
    "with open('Beall_list_dec2016.txt') as f:\n",
    "    beall_list_dec = [line.strip() for line in f]\n",
    "\n",
    "# Real lists (not lazy `map` objects) so they can be scanned repeatedly.\n",
    "beall_list_oct_formatted = [format_name(name) for name in beall_list_oct]\n",
    "beall_list_dec_formatted = [format_name(name) for name in beall_list_dec]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true,
    "run_control": {
     "frozen": false,
     "read_only": false
    }
   },
   "outputs": [],
   "source": [
    "# One list of normalised journal names per PDF, in `filenames` order.\n",
    "pdf_text_matches = [[format_name(match[0]) for match in pattern.findall(text)]\n",
    "                    for text in pdf_text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true,
    "run_control": {
     "frozen": false,
     "read_only": false
    }
   },
   "outputs": [],
   "source": [
    "# Done inside a function so no loop variable overwrites `pdf_text`.\n",
    "matching_indices = find_matches(beall_list_dec, beall_list_dec_formatted,\n",
    "                                pdf_text_matches, filenames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true,
    "run_control": {
     "frozen": false,
     "read_only": false
    }
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame(matching_indices, columns=['Journal', 'Source File'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "run_control": {
     "frozen": false,
     "read_only": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "

\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
JournalSource File
0Actual Problems of Economics8919877_Journals-1.pdf
1Aging8919877_Journals-1.pdf
2Australasian Medical Journal8919877_Journals-1.pdf
3Biosciences, Biotechnology Research Asia8919877_Journals-1.pdf
4Der Pharma Chemica6988680_Journals-2.pdf
5European Journal of Science and Theology6988680_Journals-2.pdf
6European Journal of Social Sciences6988680_Journals-2.pdf
7Global Media Journal6988680_Journals-2.pdf
8Interdisciplinary Toxicology6988680_Journals-2.pdf
9International Journal of Health Research6988680_Journals-2.pdf
10International Journal of Network Security6988680_Journals-2.pdf
11International Journal of Pharmacognosy6988680_Journals-2.pdf
12International Journal of Pharmacy and Technology8919877_Journals-1.pdf
13Journal of Animal and Plant Sciences6988680_Journals-2.pdf
14Journal of Applied Linguistics6988680_Journals-2.pdf
15Journal of Applied Pharmaceutical Science6988680_Journals-2.pdf
16Journal of Clinical and Analytical Medicine6988680_Journals-2.pdf
17Journal of Environmental Biology8919877_Journals-1.pdf
18Journal of Environmental Hydrology9047119_Journals-3.pdf
19Journal of Natural Products9047119_Journals-3.pdf
20Journal of Physical Therapy Science9047119_Journals-3.pdf
21Journal of Software9047119_Journals-3.pdf
22Oncoscience7690152_Journals-4.pdf
23PharmacologyOnline7690152_Journals-4.pdf
24Romanian Biotechnological Letters7690152_Journals-4.pdf
25Shiraz E-Medical Journal7690152_Journals-4.pdf
\n", "
" ], "text/plain": [ " Journal Source File\n", "0 Actual Problems of Economics 8919877_Journals-1.pdf\n", "1 Aging 8919877_Journals-1.pdf\n", "2 Australasian Medical Journal 8919877_Journals-1.pdf\n", "3 Biosciences, Biotechnology Research Asia 8919877_Journals-1.pdf\n", "4 Der Pharma Chemica 6988680_Journals-2.pdf\n", "5 European Journal of Science and Theology 6988680_Journals-2.pdf\n", "6 European Journal of Social Sciences 6988680_Journals-2.pdf\n", "7 Global Media Journal 6988680_Journals-2.pdf\n", "8 Interdisciplinary Toxicology 6988680_Journals-2.pdf\n", "9 International Journal of Health Research 6988680_Journals-2.pdf\n", "10 International Journal of Network Security 6988680_Journals-2.pdf\n", "11 International Journal of Pharmacognosy 6988680_Journals-2.pdf\n", "12 International Journal of Pharmacy and Technology 8919877_Journals-1.pdf\n", "13 Journal of Animal and Plant Sciences 6988680_Journals-2.pdf\n", "14 Journal of Applied Linguistics 6988680_Journals-2.pdf\n", "15 Journal of Applied Pharmaceutical Science 6988680_Journals-2.pdf\n", "16 Journal of Clinical and Analytical Medicine 6988680_Journals-2.pdf\n", "17 Journal of Environmental Biology 8919877_Journals-1.pdf\n", "18 Journal of Environmental Hydrology 9047119_Journals-3.pdf\n", "19 Journal of Natural Products 9047119_Journals-3.pdf\n", "20 Journal of Physical Therapy Science 9047119_Journals-3.pdf\n", "21 Journal of Software 9047119_Journals-3.pdf\n", "22 Oncoscience 7690152_Journals-4.pdf\n", "23 PharmacologyOnline 7690152_Journals-4.pdf\n", "24 Romanian Biotechnological Letters 7690152_Journals-4.pdf\n", "25 Shiraz E-Medical Journal 7690152_Journals-4.pdf" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "run_control": { "frozen": false, "read_only": false } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
JournalSource File
0Science and Education7690152_Journals-4.pdf
\n", "
" ],
      "text/plain": [
       " Journal Source File\n",
       "0 Science and Education 7690152_Journals-4.pdf"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same comparison against the October 2016 list. Distinct names keep the\n",
    "# December results (`matching_indices`, `df`) and the `pdf_text` list from\n",
    "# being overwritten when this cell runs.\n",
    "matching_indices_oct = []\n",
    "for journal, formatted in zip(beall_list_oct, beall_list_oct_formatted):\n",
    "    if not formatted:\n",
    "        # A blank list entry must never count as a match.\n",
    "        continue\n",
    "    for source_file, file_matches in zip(filenames, pdf_text_matches):\n",
    "        if formatted in file_matches:\n",
    "            matching_indices_oct.append((journal, source_file))\n",
    "df_oct = pd.DataFrame(matching_indices_oct, columns=['Journal', 'Source File'])\n",
    "df_oct"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "12px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": true,
   "toc_section_display": "block",
   "toc_window_display": true,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}