import { ctype_alpha as ctypeAlpha } from '../ctype/ctype_alpha.ts'

/**
 * Port of PHP's `str_word_count`: counts or extracts the words found in `str`.
 *
 * A character belongs to a word when `ctypeAlpha` accepts it, when it is listed in
 * `charlist`, when it is a hyphen that is neither the first nor the last character of
 * `str`, or when it is an apostrophe that is not the first character of `str`.
 *
 * @param str - Text to scan.
 * @param format - Omitted or `0` returns the number of words; `1` returns the words as an
 *   array; `2` returns an object mapping each word's starting index (in UTF-16 code
 *   units) to the word.
 * @param charlist - Extra characters to treat as word characters. Each character is
 *   matched literally.
 * @returns The word count, the word list, or the index-to-word map, depending on `format`.
 * @throws Error when `format` is not `0`, `1`, `2` or omitted, or when `str` or `charlist`
 *   contains an unpaired surrogate.
 */
export function str_word_count(
  str: string,
  format?: 0 | 1 | 2,
  charlist?: string,
): number | string[] | { [key: number]: string } {
  // discuss at: https://locutus.io/php/str_word_count/
  // original by: Ole Vrijenhoek
  // bugfixed by: Kevin van Zonneveld (https://kvz.io)
  // bugfixed by: Brett Zamir (https://brett-zamir.me)
  // bugfixed by: Brett Zamir (https://brett-zamir.me)
  // input by: Bug?
  // improved by: Brett Zamir (https://brett-zamir.me)
  // example 1: str_word_count("Hello fri3nd, you're\r\n looking good today!", 1)
  // returns 1: ['Hello', 'fri', 'nd', "you're", 'looking', 'good', 'today']
  // example 2: str_word_count("Hello fri3nd, you're\r\n looking good today!", 2)
  // returns 2: {0: 'Hello', 6: 'fri', 10: 'nd', 14: "you're", 29: 'looking', 46: 'good', 51: 'today'}
  // example 3: str_word_count("Hello fri3nd, you're\r\n looking good today!", 1, '\u00e0\u00e1\u00e3\u00e73')
  // returns 3: ['Hello', 'fri3nd', "you're", 'looking', 'good', 'today']
  // example 4: str_word_count('hey', 2)
  // returns 4: {0: 'hey'}
  // example 5: str_word_count('Hello world!!')
  // returns 5: 2

  const len = str.length
  const cl = charlist?.length ?? 0
  const wArr: string[] = []
  const assoc: { [key: number]: string } = {}
  let tmpStr = '' // word currently being accumulated ('' means no word is pending)
  let wC = 0 // number of completed words
  let aC = 0 // index in `str` at which the pending word started
  let reg: RegExp | null = null

  // Escapes regular-expression metacharacters so a charlist entry is matched literally.
  const _pregQuote = function (value: string): string {
    return (value + '').replace(/([\\.+*?[^\]$(){}=!<>|:])/g, '\\$1')
  }

  // Returns the full character starting at `index` (both halves of a surrogate pair when
  // applicable), or `false` when `index` points at the low half of a pair whose high half
  // sits at `index - 1`. Throws on unpaired surrogates.
  const _getWholeChar = function (value: string, index: number): string | false {
    // Use for rare cases of non-BMP characters
    const code = value.charCodeAt(index)
    if (code < 0xd800 || code > 0xdfff) {
      return value.charAt(index)
    }
    if (code >= 0xd800 && code <= 0xdbff) {
      // High surrogate (could change last hex to 0xDB7F to treat high private surrogates as single
      // characters)
      if (value.length <= index + 1) {
        throw new Error('High surrogate without following low surrogate')
      }
      const next = value.charCodeAt(index + 1)
      if (next < 0xdc00 || next > 0xdfff) {
        throw new Error('High surrogate without following low surrogate')
      }
      return value.charAt(index) + value.charAt(index + 1)
    }
    // Low surrogate (0xDC00 <= code && code <= 0xDFFF)
    if (index === 0) {
      throw new Error('Low surrogate without preceding high surrogate')
    }
    const prev = value.charCodeAt(index - 1)
    if (prev < 0xd800 || prev > 0xdbff) {
      // (could change last hex to 0xDB7F to treat high private surrogates as single characters)
      throw new Error('Low surrogate without preceding high surrogate')
    }
    // We can pass over low surrogates now as the second component in a pair which we have already
    // processed
    return false
  }

  // Records the pending word in the structure matching `format` and resets the accumulator.
  const _flushWord = function (): void {
    if (format !== 2) {
      wArr.push(tmpStr)
    } else {
      assoc[aC] = tmpStr
    }
    tmpStr = ''
    wC++
  }

  if (cl && typeof charlist === 'string') {
    // Build an anchored alternation (^(a|b|c)$) that matches exactly one extra word character.
    const alternatives: string[] = []
    for (let i = 0; i < cl; i++) {
      const wholeChar = _getWholeChar(charlist, i)
      if (wholeChar === false) {
        continue
      }
      alternatives.push(_pregQuote(wholeChar))
    }
    reg = new RegExp('^(' + alternatives.join('|') + ')$')
  }

  for (let i = 0; i < len; i++) {
    const wholeChar = _getWholeChar(str, i)
    if (wholeChar === false) {
      continue
    }

    // No hyphen at beginning or end unless allowed in charlist (or locale)
    // No apostrophe at beginning unless allowed in charlist (or locale)
    const isWordChar =
      ctypeAlpha(wholeChar) ||
      (reg !== null && reg.test(wholeChar)) ||
      (i !== 0 && i !== len - 1 && wholeChar === '-') ||
      (i !== 0 && wholeChar === "'")

    if (isWordChar) {
      if (tmpStr === '') {
        aC = i
      }
      tmpStr += wholeChar
    } else if (tmpStr !== '') {
      // A non-word character terminates the pending word. Flushing only when a word is
      // pending keeps runs of separators from producing empty words.
      _flushWord()
    }
  }

  // Flush a word that runs to the end of the string. Doing this after the loop (instead of
  // testing for the last index inside it) also covers input whose final code unit is the
  // low half of a surrogate pair, which the loop skips.
  if (tmpStr !== '') {
    _flushWord()
  }

  if (!format) {
    return wC
  } else if (format === 1) {
    return wArr
  } else if (format === 2) {
    return assoc
  }

  throw new Error('You have supplied an incorrect format')
}