import fs from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { XMLParser } from 'fast-xml-parser';
import sortKeys from 'sort-keys';

const __dirname = path.dirname(fileURLToPath(import.meta.url));

/** A single `<doc>` entry of the abstracts dump; only `abstract` is read here. */
type WikiNewsDoc = {
  // The XML parser may yield non-string values (e.g. numeric-looking text),
  // so the value is narrowed before use instead of being trusted as a string.
  abstract?: unknown;
};

/** Parsed shape of an abstracts XML file, limited to what this script reads. */
type WikiNewsFeed = {
  feed?: {
    // Presumably a lone `<doc>` is parsed as an object rather than a
    // one-element array, so both shapes are accepted.
    doc?: WikiNewsDoc | WikiNewsDoc[];
  };
};

/*
  Steps to add support for new subsets:

  1. Identify source from WikiNews data:
    - Previously sourced from mirror: https://wikidata.aerotechnet.com/
    - Find folder `<lang>wikinews`, e.g. `enwikinews`
    - Find latest date folder, e.g. `20231201`
  2. Download & extract `abstracts` data file:
    - Filename should be `<lang>wikinews-<date>-abstract.xml`, e.g. `enwikinews-20231201-abstract.xml`
  3. Add extracted XML file to `abstracts` folder
  4. Add subset name to `Subset` type below (see https://www.utf8icons.com/subsets for reference on Unicode Range names)
  5. Add mapping to `languageToSubset` below, for resolving the language from the filename to the correct subset
  6. Add Unicode Range to `unicodeRanges` below, for filtering characters to count their frequency
  7. Run `pnpm dev` to generate the weightings, local dev refs and metrics
*/

type Subset = 'latin' | 'thai';

/** Maps the two-letter language prefix of an abstracts filename to its subset. */
const languageToSubset: Partial<Record<string, Subset>> = {
  en: 'latin',
  th: 'thai',
};

/** Character-class ranges (regex syntax) of the characters counted per subset. */
const unicodeRanges: Record<Subset, string[]> = {
  // latin and latin-1 supplement (excluding control characters)
  latin: ['\u0020-\u007F', '\u00A0-\u00FF'],
  // thai (excluding diacritic marks/combining characters)
  thai: ['\u0E01-\u0E30', '\u0E32-\u0E33', '\u0E3F-\u0E46', '\u0E4F-\u0E5B'],
};

const abstractFileNamePattern = /([a-z]{2})wikinews-[\d]+-abstract\.xml/;

const SAMPLE_SIZE = 5000;

/** Rounds to 4 decimal places, the precision used for the emitted weightings. */
function roundToFourDecimals(value: number): number {
  return parseFloat(value.toFixed(4));
}

/**
 * Resolves the subset an abstracts file contributes to, based on its filename.
 *
 * @throws Error when the filename does not match `abstractFileNamePattern`, or
 *   when its language prefix has no entry in `languageToSubset`.
 */
function resolveSubset(filename: string): Subset {
  const language = abstractFileNamePattern.exec(filename)?.[1];
  if (language === undefined) {
    throw new Error(
      `Filename did not match expected source pattern: ${filename}`,
    );
  }

  const subset = languageToSubset[language];
  if (!subset) {
    throw new Error(`Could not find subset for language: ${language}`);
  }
  return subset;
}

/** Narrows a parsed abstract value to text; anything non-textual becomes ''. */
function abstractToText(abstract: unknown): string {
  if (typeof abstract === 'string') {
    return abstract;
  }
  if (typeof abstract === 'number' || typeof abstract === 'boolean') {
    return String(abstract);
  }
  return '';
}

/** Parses an abstracts XML document and joins the sampled abstracts into one string. */
function extractSampleText(xml: string): string {
  const json: WikiNewsFeed = new XMLParser().parse(xml);
  const docs = json.feed?.doc ?? [];
  const docList = Array.isArray(docs) ? docs : [docs];

  return (
    docList
      // Skips the first entry and keeps at most SAMPLE_SIZE - 1 entries.
      // NOTE(review): kept exactly as before so generated output is unchanged —
      // confirm whether dropping the first doc is intentional.
      .slice(1, SAMPLE_SIZE)
      // remove markdown link syntax
      .map(({ abstract }) =>
        abstractToText(abstract).replace(
          /\[(.[^\|]+)(?:\|.[^\]]+)?\]/g,
          '$1',
        ),
      )
      // remove some common wikiquote syntax
      .filter((a) => a !== '__NOTOC__' && a !== '----' && a !== '-----')
      .join('')
  );
}

/** Counts how often each character within the subset's Unicode ranges occurs in `text`. */
function countCharacters(
  text: string,
  subset: Subset,
): Record<string, number> {
  // Built once per call rather than per character.
  const charRegex = new RegExp(`[${unicodeRanges[subset].join('')}]`);
  const charOccurrenceCount: Record<string, number> = {};

  for (const char of text) {
    if (charRegex.test(char)) {
      charOccurrenceCount[char] = (charOccurrenceCount[char] ?? 0) + 1;
    }
  }
  return charOccurrenceCount;
}

/**
 * Converts raw character counts into relative weightings rounded to 4 decimals.
 *
 * Characters whose share of the raw total rounds to zero are dropped, and the
 * remaining weightings are computed against the total of the kept characters.
 */
function computeWeightings(
  charOccurrenceCount: Record<string, number>,
): Record<string, number> {
  const entries = Object.entries(charOccurrenceCount);
  const rawTotal = entries.reduce((sum, [, count]) => sum + count, 0);

  // Filter out zero weightings below 4 decimal places
  const kept = entries.filter(
    ([, count]) => roundToFourDecimals(count / rawTotal) > 0,
  );

  // Updated weightings depend on the filtered total, so it is summed first.
  const filteredTotal = kept.reduce((sum, [, count]) => sum + count, 0);

  return Object.fromEntries(
    kept.map(([char, count]) => [
      char,
      roundToFourDecimals(count / filteredTotal),
    ]),
  );
}

/**
 * Reads every file in the `abstracts` folder next to this script, computes the
 * character weightings for the subset each file maps to, and writes the result
 * to `../src/weightings.ts` as a default export.
 */
async function main(): Promise<void> {
  const directoryPath = path.join(__dirname, 'abstracts');
  const filenames = await fs.readdir(directoryPath);

  const weightings: Partial<Record<Subset, Record<string, number>>> = {};

  await Promise.all(
    filenames.map(async (filename) => {
      // Validate the filename before touching the file, so a stray file fails fast.
      const subset = resolveSubset(filename);

      const content = await fs.readFile(
        path.join(directoryPath, filename),
        'utf-8',
      );

      weightings[subset] = computeWeightings(
        countCharacters(extractSampleText(content), subset),
      );
    }),
  );

  await fs.writeFile(
    path.join(__dirname, '../src/weightings.ts'),
    `export default ${JSON.stringify(sortKeys(weightings), null, 2)}\n`,
    'utf-8',
  );
}

main().catch((error: unknown) => {
  // Surface the failure explicitly and signal it through the exit code.
  console.error(error);
  process.exitCode = 1;
});