{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "78729" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"https://www.science.org/doi/10.1126/science.1199295\"\"\"\n", "\n", "import http.cookiejar\n", "import io\n", "import pathlib\n", "import re\n", "import shutil\n", "import subprocess\n", "import urllib.parse\n", "import urllib.request\n", "import zipfile\n", "\n", "DOI = '10.1126/science.1199295'\n", "\n", "PDF = pathlib.Path('1199295-atkinson-som.pdf')\n", "\n", "TXT = PDF.with_suffix('.txt')\n", "\n", "URL = 'https://www.science.org/action/downloadSupplement'\n", "\n", "URL = (urllib.parse.urlparse(URL)\n", " ._replace(query=urllib.parse.urlencode({'doi': DOI,\n", " 'file': PDF})))\n", "\n", "USER_AGENT = ('Mozilla/5.0 (X11; U; Linux i686)'\n", " ' Gecko/20071127 Firefox/2.0.0.11')\n", "\n", "ENCODING = 'utf-8'\n", "\n", "\n", "if not PDF.exists():\n", " cookies = http.cookiejar.CookieJar()\n", " processor = urllib.request.HTTPCookieProcessor(cookies)\n", " opener = urllib.request.build_opener(processor)\n", "\n", " request = urllib.request.Request(URL.geturl(),\n", " headers={'User-Agent': USER_AGENT})\n", " with opener.open(request) as u, PDF.open('wb') as f:\n", " shutil.copyfileobj(u, f)\n", "\n", "if not TXT.exists(): # requires one of poppler-utils, miktex-poppler-bin, xpdf\n", " cmd = ['pdftotext', '-f', '21', '-l', '33', '-layout', '-nopgbrk', PDF, TXT]\n", " subprocess.run(cmd, check=True)\n", "\n", "_, _, S1 = TXT.read_text(encoding=ENCODING).strip().partition('\\n\\n\\n')\n", "\n", "assert ',' not in S1\n", "\n", "len(S1)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(12, 504)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "S1_HEAD = ['Language Name',\n", " 'WALS code',\n", " 'Family',\n", " 'Latitude',\n", " 'Longitude',\n", " 
'Normalized Vowel Diversity',\n", " 'Normalized Consonant Diversity',\n", " 'Normalized Tone Diversity',\n", " 'Total Normalized Phoneme Diversity',\n", " 'ISO codes',\n", " 'Estimated Speaker Pop. Size',\n", " 'Distance from best fit origin']\n", "\n", "S1_FEAT = [h for h in S1_HEAD if h.startswith('Normalized ')]\n", "S1_OUTC = S1_HEAD[-2:]\n", "\n", "S1_DATA = S1[S1.index('\\nAbkhaz '):].strip()\n", "\n", "S1_DATA = re.sub(r'(\\))([a-z]{3} )', r'\\1 \\2', S1_DATA) # fix missing space\n", "S1_DATA = re.sub(r'\\n +([a-z]{3})\\n(.+?) ', r'\\n\\2 \\1 ', S1_DATA) # fix split lines\n", "S1_DATA = S1_DATA.replace('(San Mateo del Mar)', '(San Mateo Del Mar)') # simplify iso detection\n", "\n", "S1_DATA = [l.strip() for l in S1_DATA.splitlines()]\n", "\n", "\n", "len(S1_HEAD), len(S1_DATA)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Language Name,WALS code,Family,Latitude,Longitude,Normalized Vowel Diversity,Normalized Consonant Diversity,Normalized Tone Diversity,Total Normalized Phoneme Diversity,ISO codes,Estimated Speaker Pop. 
Size,Distance from best fit origin\\nAbkhaz,abk,Northwest Caucasian,43.08,41,-1.2345266,-1.5544112,-0.7687792,-1.185905651,abk,105952,5856.362\\nAcoma,aco,Keresan,34.92,-107.58,-0.4846364,-0.7169629,1.86204304,0.220147906,kjq,3391,18601.19\\nAndoke,adk,Andoke,-0.67,-72,1.3900889,-1.5544112,0.54663194,0.'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ROW = re.compile(r'''^\n", "(.+?)\\ +\n", "([a-z]{2,3})\\ +\n", "(.+?)\\ +\n", "(-?\\d+(?:\\.\\d+)?)\\ +\n", "(-?\\d+(?:\\.\\d+)?)\\ +\n", "(-?\\d+\\.\\d+)\\ +\n", "(-?\\d+\\.\\d+)\\ +\n", "(-?\\d+\\.\\d+)\\ +\n", "(-?\\d+\\.\\d+)\\ +\n", "(.+?)\\ +\n", "(\\d+)\\ +\n", "(\\d+(?:\\.\\d+)?)\n", "$''', flags=re.VERBOSE)\n", "\n", "\n", "csv = '\\n'.join(','.join(r) for r in [S1_HEAD] + [ROW.match(l).groups() for l in S1_DATA])\n", "\n", "csv[:500]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 504 entries, abk to zun\n", "Data columns (total 11 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Language Name 504 non-null object \n", " 1 Family 504 non-null object \n", " 2 Latitude 504 non-null float64\n", " 3 Longitude 504 non-null float64\n", " 4 Normalized Vowel Diversity 504 non-null float64\n", " 5 Normalized Consonant Diversity 504 non-null float64\n", " 6 Normalized Tone Diversity 504 non-null float64\n", " 7 Total Normalized Phoneme Diversity 504 non-null float64\n", " 8 ISO codes 504 non-null object \n", " 9 Estimated Speaker Pop. Size 504 non-null int64 \n", " 10 Distance from best fit origin 504 non-null float64\n", "dtypes: float64(7), int64(1), object(3)\n", "memory usage: 47.2+ KB\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Language NameFamilyLatitudeLongitudeNormalized Vowel DiversityNormalized Consonant DiversityNormalized Tone DiversityTotal Normalized Phoneme DiversityISO codesEstimated Speaker Pop. SizeDistance from best fit origin
WALS code
abkAbkhazNorthwest Caucasian43.0841.00-1.234527-1.554411-0.768779-1.185906abk1059525856.362
acoAcomaKeresan34.92-107.58-0.484636-0.7169631.8620430.220148kjq339118601.190
adkAndokeAndoke-0.67-72.001.390089-1.5544110.5466320.127437ano61923780.160
aeaAleut (Eastern)Eskimo-Aleut54.75-164.00-1.2345270.957934-0.768779-0.348457ale49014629.420
aegArabic (Egyptian)Afro-Asiatic30.0031.00-0.4846360.120485-0.768779-0.377643arz463210004153.443
\n", "
" ], "text/plain": [ " Language Name Family Latitude Longitude \\\n", "WALS code \n", "abk Abkhaz Northwest Caucasian 43.08 41.00 \n", "aco Acoma Keresan 34.92 -107.58 \n", "adk Andoke Andoke -0.67 -72.00 \n", "aea Aleut (Eastern) Eskimo-Aleut 54.75 -164.00 \n", "aeg Arabic (Egyptian) Afro-Asiatic 30.00 31.00 \n", "\n", " Normalized Vowel Diversity Normalized Consonant Diversity \\\n", "WALS code \n", "abk -1.234527 -1.554411 \n", "aco -0.484636 -0.716963 \n", "adk 1.390089 -1.554411 \n", "aea -1.234527 0.957934 \n", "aeg -0.484636 0.120485 \n", "\n", " Normalized Tone Diversity Total Normalized Phoneme Diversity \\\n", "WALS code \n", "abk -0.768779 -1.185906 \n", "aco 1.862043 0.220148 \n", "adk 0.546632 0.127437 \n", "aea -0.768779 -0.348457 \n", "aeg -0.768779 -0.377643 \n", "\n", " ISO codes Estimated Speaker Pop. Size \\\n", "WALS code \n", "abk abk 105952 \n", "aco kjq 3391 \n", "adk ano 619 \n", "aea ale 490 \n", "aeg arz 46321000 \n", "\n", " Distance from best fit origin \n", "WALS code \n", "abk 5856.362 \n", "aco 18601.190 \n", "adk 23780.160 \n", "aea 14629.420 \n", "aeg 4153.443 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%matplotlib inline\n", "\n", "import itertools\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import scipy.stats\n", "\n", "\n", "def pearsonr(df, left, right, func=scipy.stats.pearsonr):\n", " df = df[[left, right]].dropna()\n", " name = '%s & %s' % (left, right)\n", " return pd.Series(func(df[left], df[right]), index=('r', 'p'), name=name)\n", "\n", "\n", "with io.StringIO(csv) as f:\n", " s1 = pd.read_csv(f, na_values='', keep_default_na=False, index_col='WALS code')\n", "\n", "s1.info()\n", "assert s1.index.is_unique\n", "assert s1.index.is_monotonic_increasing\n", "s1.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rp
Estimated Speaker Pop. Size & Normalized Vowel Diversity0.031530.48006
Estimated Speaker Pop. Size & Normalized Consonant Diversity0.089690.04415
Estimated Speaker Pop. Size & Normalized Tone Diversity0.036510.41345
Estimated Speaker Pop. Size & Total Normalized Phoneme Diversity0.082190.06523
Distance from best fit origin & Normalized Vowel Diversity-0.394210.00000
Distance from best fit origin & Normalized Consonant Diversity-0.260140.00000
Distance from best fit origin & Normalized Tone Diversity-0.390630.00000
Distance from best fit origin & Total Normalized Phoneme Diversity-0.544660.00000
\n", "
" ], "text/plain": [ " r p\n", "Estimated Speaker Pop. Size & Normalized Vowel ... 0.03153 0.48006\n", "Estimated Speaker Pop. Size & Normalized Conson... 0.08969 0.04415\n", "Estimated Speaker Pop. Size & Normalized Tone D... 0.03651 0.41345\n", "Estimated Speaker Pop. Size & Total Normalized ... 0.08219 0.06523\n", "Distance from best fit origin & Normalized Vowe... -0.39421 0.00000\n", "Distance from best fit origin & Normalized Cons... -0.26014 0.00000\n", "Distance from best fit origin & Normalized Tone... -0.39063 0.00000\n", "Distance from best fit origin & Total Normalize... -0.54466 0.00000" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PAIRS = list(itertools.product(S1_OUTC, S1_FEAT + ['Total Normalized Phoneme Diversity']))\n", "\n", "\n", "pd.DataFrame([pearsonr(s1, x, y) for x, y in PAIRS]).round(5)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rp
Population & Total Normalized Phoneme Diversity0.384540.0
\n", "
" ], "text/plain": [ " r p\n", "Population & Total Normalized Phoneme Diversity 0.38454 0.0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "s1.plot.scatter(x='Estimated Speaker Pop. Size', y='Total Normalized Phoneme Diversity', logx=True);\n", "\n", "(pearsonr(s1.assign(Population=s1['Estimated Speaker Pop. Size'].apply(np.log10)),\n", " 'Population', 'Total Normalized Phoneme Diversity')\n", " .to_frame().T.round(5))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rp
Distance & Total Normalized Phoneme Diversity-0.544660.0
\n", "
" ], "text/plain": [ " r p\n", "Distance & Total Normalized Phoneme Diversity -0.54466 0.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "s1.plot.scatter(x='Distance from best fit origin', y='Total Normalized Phoneme Diversity');\n", "\n", "(pearsonr(s1.assign(Distance=s1['Distance from best fit origin']),\n", " 'Distance', 'Total Normalized Phoneme Diversity')\n", " .to_frame().T.round(5))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 2679 entries, aab to zzo\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name 2679 non-null object \n", " 1 latitude 2679 non-null float64\n", " 2 longitude 2679 non-null float64\n", " 3 family 2679 non-null object \n", " 4 1A Consonant Inventories 563 non-null object \n", " 5 2A Vowel Quality Inventories 564 non-null object \n", " 6 13A Tone 527 non-null object \n", "dtypes: float64(2), object(5)\n", "memory usage: 167.4+ KB\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Namelatitudelongitudefamily1A Consonant Inventories2A Vowel Quality Inventories13A Tone
wals_code
aabArapesh (Abu)-3.450000142.950000TorricelliNaNNaNNaN
aarAari6.00000036.583333Afro-AsiaticNaNNaNNaN
abaAbau-4.000000141.250000SepikNaNNaNNaN
abbArabic (Chadian)13.83333320.833333Afro-AsiaticNaNNaNNaN
abdAbidji5.666667-4.583333Niger-CongoNaNNaNNaN
\n", "
" ], "text/plain": [ " Name latitude longitude family \\\n", "wals_code \n", "aab Arapesh (Abu) -3.450000 142.950000 Torricelli \n", "aar Aari 6.000000 36.583333 Afro-Asiatic \n", "aba Abau -4.000000 141.250000 Sepik \n", "abb Arabic (Chadian) 13.833333 20.833333 Afro-Asiatic \n", "abd Abidji 5.666667 -4.583333 Niger-Congo \n", "\n", " 1A Consonant Inventories 2A Vowel Quality Inventories 13A Tone \n", "wals_code \n", "aab NaN NaN NaN \n", "aar NaN NaN NaN \n", "aba NaN NaN NaN \n", "abb NaN NaN NaN \n", "abd NaN NaN NaN " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "URL = 'http://cdstar.shh.mpg.de/bitstreams/EAEA0-7269-77E5-3E10-0/wals_language.csv.zip'\n", "\n", "ARCHIVE = pathlib.Path(URL.rpartition('/')[2])\n", "\n", "EXTRACT = 'language.csv'\n", "\n", "INFO = ['wals_code', 'Name', 'family', 'latitude', 'longitude']\n", "\n", "FEAT = ['2A Vowel Quality Inventories', '1A Consonant Inventories', '13A Tone']\n", "\n", "\n", "if not ARCHIVE.exists():\n", " urllib.request.urlretrieve(URL, ARCHIVE)\n", "\n", "with zipfile.ZipFile(ARCHIVE) as archive:\n", " wf = pd.read_csv(archive.open(EXTRACT), encoding=ENCODING,\n", " na_values='', keep_default_na=False,\n", " index_col=INFO[0], usecols=INFO + FEAT)\n", "\n", "wf.info()\n", "assert wf.index.is_unique\n", "wf.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 567 entries, abi to zun\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name 567 non-null object \n", " 1 latitude 567 non-null float64\n", " 2 longitude 567 non-null float64\n", " 3 family 567 non-null object \n", " 4 1A Consonant Inventories 563 non-null object \n", " 5 2A Vowel Quality Inventories 564 non-null object \n", " 6 13A Tone 527 non-null object \n", "dtypes: float64(2), object(5)\n", "memory usage: 35.4+ KB\n" ] } ], "source": 
[ "wf = wf.dropna(how='all', subset=FEAT)\n", "\n", "wf.info()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 504 entries, abk to zun\n", "Data columns (total 18 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Language Name 504 non-null object \n", " 1 Family 504 non-null object \n", " 2 Latitude 504 non-null float64\n", " 3 Longitude 504 non-null float64\n", " 4 Normalized Vowel Diversity 504 non-null float64\n", " 5 Normalized Consonant Diversity 504 non-null float64\n", " 6 Normalized Tone Diversity 504 non-null float64\n", " 7 Total Normalized Phoneme Diversity 504 non-null float64\n", " 8 ISO codes 504 non-null object \n", " 9 Estimated Speaker Pop. Size 504 non-null int64 \n", " 10 Distance from best fit origin 504 non-null float64\n", " 11 Name 504 non-null object \n", " 12 latitude 504 non-null float64\n", " 13 longitude 504 non-null float64\n", " 14 family 504 non-null object \n", " 15 1A Consonant Inventories 504 non-null object \n", " 16 2A Vowel Quality Inventories 504 non-null object \n", " 17 13A Tone 504 non-null object \n", "dtypes: float64(9), int64(1), object(8)\n", "memory usage: 91.0+ KB\n" ] } ], "source": [ "df = s1.join(wf)\n", "\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Namelatitudelongitudefamily1A Consonant Inventories2A Vowel Quality Inventories13A Tone
abiAbipón-29.000000-61.000000Guaicuruan2 Moderately small2 Average (5-6)NaN
abmAlabama32.333333-87.416667Muskogean1 Small1 Small (2-4)NaN
achAché-25.250000-55.166667Tupian1 Small2 Average (5-6)NaN
acmAchumawi41.500000-121.000000Hokan2 Moderately small2 Average (5-6)2 Simple tone system
adzAdzera-6.250000146.250000Austronesian2 Moderately small1 Small (2-4)NaN
\n", "
" ], "text/plain": [ " Name latitude longitude family 1A Consonant Inventories \\\n", "abi Abipón -29.000000 -61.000000 Guaicuruan 2 Moderately small \n", "abm Alabama 32.333333 -87.416667 Muskogean 1 Small \n", "ach Aché -25.250000 -55.166667 Tupian 1 Small \n", "acm Achumawi 41.500000 -121.000000 Hokan 2 Moderately small \n", "adz Adzera -6.250000 146.250000 Austronesian 2 Moderately small \n", "\n", " 2A Vowel Quality Inventories 13A Tone \n", "abi 2 Average (5-6) NaN \n", "abm 1 Small (2-4) NaN \n", "ach 2 Average (5-6) NaN \n", "acm 2 Average (5-6) 2 Simple tone system \n", "adz 1 Small (2-4) NaN " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wf.loc[wf.index.difference(s1.index)].head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mismatches
2A Vowel Quality Inventories1
1A Consonant Inventories86
13A Tone0
\n", "
" ], "text/plain": [ " mismatches\n", "2A Vowel Quality Inventories 1\n", "1A Consonant Inventories 86\n", "13A Tone 0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "COLS = list(itertools.chain.from_iterable(zip(S1_FEAT, FEAT)))\n", "\n", "MAP = dict(zip(S1_FEAT, FEAT))\n", "\n", "\n", "_category = pd.api.types.CategoricalDtype(ordered=True)\n", "\n", "df[COLS] = df[COLS].apply(lambda x: x.astype(_category).cat.codes)\n", "\n", "mism = df[S1_FEAT].rename(columns=MAP) != df[FEAT]\n", "\n", "mism.sum().to_frame('mismatches')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Normalized Vowel Diversity2A Vowel Quality InventoriesNormalized Consonant Diversity1A Consonant InventoriesNormalized Tone Diversity13A Tone
WALS code
abk000400
aco111422
aea003200
aeg112300
agh223211
\n", "
" ], "text/plain": [ " Normalized Vowel Diversity 2A Vowel Quality Inventories \\\n", "WALS code \n", "abk 0 0 \n", "aco 1 1 \n", "aea 0 0 \n", "aeg 1 1 \n", "agh 2 2 \n", "\n", " Normalized Consonant Diversity 1A Consonant Inventories \\\n", "WALS code \n", "abk 0 4 \n", "aco 1 4 \n", "aea 3 2 \n", "aeg 2 3 \n", "agh 3 2 \n", "\n", " Normalized Tone Diversity 13A Tone \n", "WALS code \n", "abk 0 0 \n", "aco 2 2 \n", "aea 0 0 \n", "aeg 0 0 \n", "agh 1 1 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[mism.any(axis=1), COLS].head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-4 1\n", "-3 7\n", "-2 10\n", "-1 21\n", " 0 418\n", " 1 33\n", " 2 9\n", " 3 4\n", " 4 1\n", "Name: count, dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diff = df['Normalized Consonant Diversity'] - df['1A Consonant Inventories']\n", "\n", "diff.value_counts().sort_index()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "nf = df[diff != 0].assign(diff=diff)\n", "\n", "nf['diff'].sort_values().plot.bar(figsize=(15, 3));" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rp
diff & Estimated Speaker Pop. Size0.0763650.484652
diff & Distance from best fit origin0.4170430.000065
\n", "
" ], "text/plain": [ " r p\n", "diff & Estimated Speaker Pop. Size 0.076365 0.484652\n", "diff & Distance from best fit origin 0.417043 0.000065" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame([pearsonr(nf, 'diff', x) for x in\n", " ['Estimated Speaker Pop. Size', 'Distance from best fit origin']])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "nf.plot.scatter(x='Distance from best fit origin', y='diff');" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rp
Diversity & Estimated Speaker Pop. Size0.066180.1379
Diversity & Distance from best fit origin-0.588840.0000
\n", "
" ], "text/plain": [ " r p\n", "Diversity & Estimated Speaker Pop. Size 0.06618 0.1379\n", "Diversity & Distance from best fit origin -0.58884 0.0000" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nf = df[FEAT + S1_OUTC + ['Total Normalized Phoneme Diversity']].copy()\n", "\n", "nf[FEAT] = nf[FEAT].apply(scipy.stats.zscore)\n", "\n", "nf['Diversity'] = nf[FEAT].mean(axis=1)\n", "\n", "pd.DataFrame([pearsonr(nf, 'Diversity', x) for x in\n", " ['Estimated Speaker Pop. Size', 'Distance from best fit origin']]).round(5)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rp
Population & Diversity0.413790.0
\n", "
" ], "text/plain": [ " r p\n", "Population & Diversity 0.41379 0.0" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(pearsonr(nf.assign(Population=s1['Estimated Speaker Pop. Size'].apply(np.log10)),\n", " 'Population', 'Diversity')\n", " .to_frame().T.round(5))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rp
Distance & Diversity-0.588840.0
\n", "
" ], "text/plain": [ " r p\n", "Distance & Diversity -0.58884 0.0" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(pearsonr(nf.assign(Distance=s1['Distance from best fit origin']),\n", " 'Distance', 'Diversity')\n", " .to_frame().T.round(5))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 4 }