{ "cells": [ { "cell_type": "markdown", "metadata": { "run_control": { "frozen": false, "read_only": false }, "toc": "true" }, "source": [ "# Table of Contents\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true, "run_control": { "frozen": false, "read_only": false } }, "outputs": [], "source": [ "import re\n", "import string\n", "from string import digits\n", "import textract\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true, "run_control": { "frozen": false, "read_only": false } }, "outputs": [], "source": [ "pattern = re.compile(r'[0-9](.*?)\\s(Scopus|WoS)', re.DOTALL)\n", "\n", "def read_text(filename):\n", " text = textract.process(filename).replace('.','')\n", " return ' '.join(text.split())\n", "\n", "def format_name(name):\n", " return filter(str.isalnum, name.translate(None, digits).replace(' ','').lower())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true, "run_control": { "frozen": false, "read_only": false } }, "outputs": [], "source": [ "filenames = ['8919877_Journals-1.pdf',\n", " '6988680_Journals-2.pdf',\n", " '9047119_Journals-3.pdf',\n", " '7690152_Journals-4.pdf',\n", " '3554232_Journals-5.pdf']\n", "pdf_text = []\n", "for filename in filenames:\n", " pdf_text.append(read_text(filename))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true, "run_control": { "frozen": false, "read_only": false } }, "outputs": [], "source": [ "with open('Beall_list_oct2016.txt') as f:\n", " beall_list_oct = [x.strip() for x in f.readlines()]\n", "\n", "with open('Beall_list_dec2016.txt') as f:\n", " beall_list_dec = [x.strip() for x in f.readlines()]\n", "\n", "beall_list_oct_formatted = map(lambda x: format_name(x), beall_list_oct)\n", "beall_list_dec_formatted = map(lambda x: format_name(x), beall_list_dec)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true, "run_control": { "frozen": false, "read_only": false } }, "outputs": [], "source": [ "pdf_text_matches = []\n", "for text in pdf_text:\n", " matches = map(lambda x: format_name(x[0]), pattern.findall(text))\n", " pdf_text_matches.append(matches)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true, "run_control": { "frozen": false, "read_only": false } }, "outputs": [], "source": [ "matching_indices = [] \n", "for i, item in enumerate(beall_list_dec_formatted):\n", " for j, pdf_text in enumerate(pdf_text_matches):\n", " if item in pdf_text:\n", " matching_indices.append((beall_list_dec[i],filenames[j]))\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true, "run_control": { "frozen": false, "read_only": false } }, "outputs": [], "source": [ "df = pd.DataFrame(matching_indices, columns=['Journal', 'Source File'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "run_control": { "frozen": false, "read_only": false } }, "outputs": [ { "data": { "text/html": [ "
\n", " | Journal | \n", "Source File | \n", "
---|---|---|
0 | \n", "Actual Problems of Economics | \n", "8919877_Journals-1.pdf | \n", "
1 | \n", "Aging | \n", "8919877_Journals-1.pdf | \n", "
2 | \n", "Australasian Medical Journal | \n", "8919877_Journals-1.pdf | \n", "
3 | \n", "Biosciences, Biotechnology Research Asia | \n", "8919877_Journals-1.pdf | \n", "
4 | \n", "Der Pharma Chemica | \n", "6988680_Journals-2.pdf | \n", "
5 | \n", "European Journal of Science and Theology | \n", "6988680_Journals-2.pdf | \n", "
6 | \n", "European Journal of Social Sciences | \n", "6988680_Journals-2.pdf | \n", "
7 | \n", "Global Media Journal | \n", "6988680_Journals-2.pdf | \n", "
8 | \n", "Interdisciplinary Toxicology | \n", "6988680_Journals-2.pdf | \n", "
9 | \n", "International Journal of Health Research | \n", "6988680_Journals-2.pdf | \n", "
10 | \n", "International Journal of Network Security | \n", "6988680_Journals-2.pdf | \n", "
11 | \n", "International Journal of Pharmacognosy | \n", "6988680_Journals-2.pdf | \n", "
12 | \n", "International Journal of Pharmacy and Technology | \n", "8919877_Journals-1.pdf | \n", "
13 | \n", "Journal of Animal and Plant Sciences | \n", "6988680_Journals-2.pdf | \n", "
14 | \n", "Journal of Applied Linguistics | \n", "6988680_Journals-2.pdf | \n", "
15 | \n", "Journal of Applied Pharmaceutical Science | \n", "6988680_Journals-2.pdf | \n", "
16 | \n", "Journal of Clinical and Analytical Medicine | \n", "6988680_Journals-2.pdf | \n", "
17 | \n", "Journal of Environmental Biology | \n", "8919877_Journals-1.pdf | \n", "
18 | \n", "Journal of Environmental Hydrology | \n", "9047119_Journals-3.pdf | \n", "
19 | \n", "Journal of Natural Products | \n", "9047119_Journals-3.pdf | \n", "
20 | \n", "Journal of Physical Therapy Science | \n", "9047119_Journals-3.pdf | \n", "
21 | \n", "Journal of Software | \n", "9047119_Journals-3.pdf | \n", "
22 | \n", "Oncoscience | \n", "7690152_Journals-4.pdf | \n", "
23 | \n", "PharmacologyOnline | \n", "7690152_Journals-4.pdf | \n", "
24 | \n", "Romanian Biotechnological Letters | \n", "7690152_Journals-4.pdf | \n", "
25 | \n", "Shiraz E-Medical Journal | \n", "7690152_Journals-4.pdf | \n", "
\n", " | Journal | \n", "Source File | \n", "
---|---|---|
0 | \n", "Science and Education | \n", "7690152_Journals-4.pdf | \n", "