# 5. Journals indexed in Scopus by their disciplinary distribution

### Notebook objectives:
1. Determine the disciplinary distribution of Scopus journals for the sake of comparison to OJS.
*Updated 9/22/2022

In [1]:
!python --version

Python 3.10.5


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import requests
import json
import os
import re
from easynmt import EasyNMT
from tqdm import tqdm

In [3]:
scopus = pd.read_excel(os.path.join('data', 'scopus_jan2021.xlsx'))
scopus = scopus.drop_duplicates(subset=["Print-ISSN", "E-ISSN"])

### Education isn't a defined subject area in the Scopus data, so I approximate the number of Education journals using a multilngual string search for "education," "teach," and "learn" in journal titles:

In [4]:
langs = scopus['Article language in source (three-letter ISO language codes)'].unique().tolist()
langs = [re.split(r"[^a-zA-Z]+", l) for l in langs if isinstance(l, str)]
langs = sorted(list(set([l for subl in langs for l in subl])))
print(len(langs), "\n", langs)

49 
 ['AFR', 'ARA', 'ARM', 'AZE', 'BAQ', 'BOS', 'BUL', 'CAT', 'CHI', 'CHN', 'CZE', 'DAN', 'DUT', 'ENF', 'ENG', 'EST', 'FIN', 'FRE', 'GER', 'GLE', 'GLG', 'GRE', 'HEB', 'HUN', 'ICE', 'IND', 'ITA', 'JPN', 'KOR', 'LAV', 'LIT', 'MAC', 'MAO', 'MAY', 'NOR', 'PER', 'POL', 'POR', 'RUM', 'RUS', 'SCC', 'SCR', 'SLO', 'SLV', 'SPA', 'SWE', 'THA', 'TUR', 'UKR']


In [5]:
print([f"{lang}: {scopus.iloc[:, 7].str.contains(lang).sum()}" for lang in langs])

['AFR: 13', 'ARA: 24', 'ARM: 1', 'AZE: 3', 'BAQ: 6', 'BOS: 9', 'BUL: 18', 'CAT: 33', 'CHI: 562', 'CHN: 1', 'CZE: 131', 'DAN: 14', 'DUT: 71', 'ENF: 1', 'ENG: 27125', 'EST: 19', 'FIN: 18', 'FRE: 1197', 'GER: 997', 'GLE: 5', 'GLG: 2', 'GRE: 38', 'HEB: 7', 'HUN: 44', 'ICE: 3', 'IND: 4', 'ITA: 535', 'JPN: 199', 'KOR: 69', 'LAV: 8', 'LIT: 19', 'MAC: 2', 'MAO: 1', 'MAY: 12', 'NOR: 20', 'PER: 46', 'POL: 190', 'POR: 474', 'RUM: 53', 'RUS: 412', 'SCC: 15', 'SCR: 109', 'SLO: 64', 'SLV: 61', 'SPA: 1348', 'SWE: 23', 'THA: 3', 'TUR: 134', 'UKR: 26']


#### Save a list of ISO-639-1 language codes with Latin scripts, because the Scopus data only feature titles written in Latin scripts:
(Transliteration is used by Scopus, but transliterating "Education" to Chinese "Jiaoyu" returns no titles. I will skip transliteration because the success rate seems so low.)

In [6]:
iso639_1 = ["af", "ca", "cs", "da", "nl", "et", "fi", "fr", "de", "hu", "id", "it", "lv", "lt", "no", "pl",
 "pt", "ro", "sr", "sk", "sl", "es", "sv", "tr"]

In [7]:
model = EasyNMT("opus-mt")

In [8]:
doc = ["education",
 "learn",
 "teach"]
edu = []

for code in iso639_1:
 try:
 edu.extend(model.translate(doc, target_lang=code))
 except OSError:
 continue
print(edu)

Exception: Helsinki-NLP/opus-mt-en-lv is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.
Exception: Helsinki-NLP/opus-mt-en-lt is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.
Exception: Helsinki-NLP/opus-mt-en-no is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.
Exception: Helsinki-NLP/opus-mt-en-pl

['Opvoeding', 'leer', 'onderrig', 'educació', 'learn', 'ensenyeu', 'vzdělávání', 'učit se', 'Učit', 'uddannelse', 'lær', 'underviser', 'onderwijs', 'leren', 'lesgeven', 'haridus', 'õpi', 'õpetamine', 'koulutus', 'opi', 'opettaa', 'éducation', 'apprendre', 'enseigner', 'Bildung', 'lernen', 'Unterricht', 'oktatás', 'tanulj!', 'Tanárnő!', 'pendidikan', 'belajar', 'mengajar', 'istruzione', 'imparare', 'Insegna', 'educaţie', 'Învaţă', 'Predă', 'vzdelávanie', 'učiť sa', 'vyučovať', 'Educación', 'aprender', 'enseñar', 'utbildning', 'lära dig', 'lära ut']


In [9]:
scopus.iloc[:, 1].str.contains(
 "ducat|teach|learn|opvoeding|onderrig|educaci|enseny|vzdela|ucit se|uddanel|undervis|onderwijs|leren|lesgev|haridus|opetami|koulutus|opettaa|apprend|enseign|bildung|lernen|unterricht|oktatas|tanulj|tanarno|pendidikan|belajar|mengajar|istruzione|imparare|insenga|invata|vyuco|aprend|ensena|utbild|lara dig|lara ut|jiaoyu", 
 regex=True, case=False).sum()

1097

In [10]:
ed = 1097 / 41957
print(f"Education journals: {round(ed*100, 1)}%")

Education journals: 2.6%


In [11]:
print(scopus.columns[23:])
scopus.iloc[:, 23:].info()

Index(['Top level:\n\nLife Sciences', 'Top level:\n\nSocial Sciences',
 'Top level:\n\nPhysical Sciences', 'Top level:\n\nHealth Sciences',
 '1000 \nGeneral', '1100\nAgricultural and Biological Sciences',
 '1200\nArts and Humanities',
 '1300\nBiochemistry, Genetics and Molecular Biology',
 '1400\nBusiness, Management and Accounting',
 '1500\nChemical Engineering', '1600\nChemistry',
 '1700\nComputer Science', '1800\nDecision Sciences',
 '1900\nEarth and Planetary Sciences',
 '2000\nEconomics, Econometrics and Finance', '2100\nEnergy',
 '2200\nEngineering', '2300\nEnvironmental Science',
 '2400\nImmunology and Microbiology', '2500\nMaterials Science',
 '2600\nMathematics', '2700\nMedicine', '2800\nNeuroscience',
 '2900\nNursing', '3000\nPharmacology, Toxicology and Pharmaceutics',
 '3100\nPhysics and Astronomy', '3200\nPsychology',
 '3300\nSocial Sciences', '3400\nVeterinary', '3500\nDentistry',
 '3600\nHealth Professions'],
 dtype='object')

Int64Index: 41958 entries, 0 to 42473
Data c

In [12]:
scopus[(scopus["1700\nComputer Science"].notnull()) | 
 (scopus["2200\nEngineering"].notnull())].shape

(6752, 54)

In [13]:
csen = 6752 / 41957
print(f"% CS & Engineering journals: {round(csen*100, 1)}%")

% CS & Engineering journals: 16.1%


In [14]:
scopus[scopus["2600\nMathematics"].notnull()].shape

(1929, 54)

In [15]:
math = 1929 / 41957
print(f"% Math journals: {round(math*100, 1)}%")

% Math journals: 4.6%


In [16]:
medh = scopus[scopus["Top level:\n\nHealth Sciences"].notnull()].shape[0] / 41957
print(f"% Med-Health journals: {round(medh*100, 1)}%")

% Med-Health journals: 35.1%


### Use OpenAlex to try and disaggregate the "Social Sciences" journals:

In [17]:
e_socsci = scopus[(scopus["Top level:\n\nSocial Sciences"].notnull()) &
 (scopus["E-ISSN"].notnull())]["E-ISSN"]
e_socsci = list(zip(e_socsci.index, [str(issn)[:4] + "-" + str(issn)[4:] for issn in e_socsci]))

In [18]:
print_socsci = scopus[(scopus["Top level:\n\nSocial Sciences"].notnull()) &
 (scopus["Print-ISSN"].notnull())]["Print-ISSN"]
print_socsci = list(zip(print_socsci.index, [str(issn)[:4] + "-" + str(issn)[4:] for issn in print_socsci]))

In [19]:
ss_issns = e_socsci + print_socsci

In [20]:
def get_subjects(list_of_tuples):
 
 idx2subject = []
 error_issns = []
 
 for i, v in tqdm(list_of_tuples):
 query = "https://api.openalex.org/venues/issn:" + v
 
 try:
 response = json.loads(
 requests.get(query).content.decode()
 )
 subject = response["x_concepts"][0]["display_name"]
 except:
 error_issns.append(v)
 
 idx2subject.append(
 (i, subject)
 )
 
 return idx2subject, error_issns

In [21]:
idx2subject, errors = get_subjects(ss_issns)

100%|███████████████████████████████████| 19908/19908 [2:39:52<00:00, 2.08it/s]


In [22]:
with open(os.path.join("data", "idx2subject_ss.json"), "w") as outfile:
 json.dump(idx2subject, outfile)

In [23]:
print(set([t[1] for t in idx2subject]))

{'History', 'Ecology', 'Art', 'Linguistics', 'Genetics', 'Humanities', 'Common value auction', 'Business', 'Macroeconomics', 'Economics', 'Chemistry', 'Visual arts', 'Biology', 'Physics', 'Demographic economics', 'Finance', 'Materials science', 'Economic geography', 'Population', 'Computer science', 'Psychology', 'Law', 'Nanotechnology', 'Thermodynamics', 'Outbreak', 'Geology', 'Environmental science', 'Astronomy', 'Poison control', 'Geophysics', 'Political science', 'Medicine', 'Nursing', 'Mathematics', 'Geography', 'Archaeology', 'Philosophy', 'Engineering', 'Monetary policy', 'Sociology'}


In [25]:
d = {}
phil = 0
ling = 0
hum = 0

for idx, subject in idx2subject:
 
 if idx not in d:
 d[idx] = subject
 
 match subject:
 
 case "Philosophy":
 phil += 1
 
 case "Linguistics":
 ling += 1
 
 case "Humanities":
 hum += 1

print(f"% Philosophy journals: {round(phil / 41957 * 100, 1)}%")
print(f"% Linguistics journals: {round(ling / 41957 * 100, 1)}%")
print(f"% Humanities journals: {round(hum / 41957 * 100, 1)}%")
print(f"Phil: {phil}, Ling: {ling}, Hum: {hum}")

% Philosophy journals: 5.6%
% Linguistics journals: 0.0%
% Humanities journals: 0.1%
Phil: 2365, Ling: 3, Hum: 23


In [26]:
print(len(d))

13940


In [28]:
hist = 0
art = 0
visa = 0
soc = 0

for idx, subject in d.items():
 
 match subject:
 case "History":
 hist += 1
 case "Art":
 art += 1
 case "Visual arts":
 visa += 1
 case "Sociology":
 soc += 1
 
print(f"% History journals: {round(hist / 41957 * 100, 1)}%")
print(f"% Art + Visual Arts journals: {round((art+visa) / 41957 * 100, 1)}%")
print(f"% Sociology journals: {round(soc / 41957 * 100, 1)}%")
print(f"Hist: {hist}, Art: {art}, Vis Art: {visa}, Soc: {soc}")

% History journals: 1.9%
% Art + Visual Arts journals: 3.4%
% Sociology journals: 1.6%
Hist: 792, Art: 1437, Vis Art: 1, Soc: 656


### Final step: I need to give a rough estimate of the proportions of Scopus-indexed journals falling under the rubrics of "Language, communication, and culture" and "Philosophy and religion."

First, I will assume that a "Philosophy" classification from OpenAlex genuinely indicates a philosophy journal only if the journal is also classified by Scopus as "Arts and Humanities":

In [30]:
alex = scopus[scopus.index.isin(
 list(set([t[0] for t in idx2subject]))
)]
print(alex.shape)

(13940, 54)


In [32]:
phil_indices = [idx for idx, subject in d.items() if subject == "Philosophy"]
print(len(phil_indices))

2365


In [34]:
philn = alex[
 (alex.index.isin(phil_indices)) & (alex["1200\nArts and Humanities"].notnull())
].shape[0]
print(philn)
print(f"% Philosophy journals, double checked: {round(philn / 41957 * 100, 1)}")

1694
% Philosophy journals, double checked: 4.0


As for Scopus's "Arts and Humanities" journals which aren't labeled "Philosophy," "Arts," or "Visual arts" by OpenAlex, those provide a rough estimate for the number of journals in "Language, communication, and culture." That is to say, these are just "Humanities" journals:

In [41]:
art_indices = [idx for idx, subject in d.items() if subject == "Art" or subject == "Visual arts"]

In [42]:
print(len(art_indices))

1438


In [45]:
lcc = alex[
 (alex["1200\nArts and Humanities"].notnull()) &
 (~alex.index.isin(art_indices)) & 
 (~alex.index.isin(phil_indices))
].shape[0]
print(lcc)
print(f"% Language, communication, and culture journals, double checked: {round(lcc / 41957 * 100, 1)}")

2493
% Language, communication, and culture journals, double checked: 5.9
