flesch_kincaid_reading_ease($text); */

/**
 * Computes readability scores (Flesch-Kincaid, Gunning-Fog, Coleman-Liau,
 * SMOG, Automated Readability Index) and the word, sentence, letter and
 * syllable counts they are built from.
 *
 * Every public method accepts raw text (HTML is tolerated) and normalises it
 * with clean_text() before measuring. The syllable heuristics are written for
 * English words.
 */
class TextStatistics
{
    /**
     * Character encoding handed to the mb_* functions. An empty string means
     * the mb_* functions are called without an explicit encoding argument.
     *
     * @var string
     */
    protected $strEncoding = '';

    /**
     * Constructor.
     *
     * @param string $strEncoding Optional character encoding (e.g. 'UTF-8').
     */
    public function __construct($strEncoding = '')
    {
        if ((string) $strEncoding !== '') {
            $this->strEncoding = (string) $strEncoding;
        }
    }

    /**
     * Gives the Flesch-Kincaid Reading Ease of the text, rounded to one digit.
     *
     * @param string $strText Text to be checked.
     * @return float
     */
    public function flesch_kincaid_reading_ease($strText)
    {
        $strText = $this->clean_text($strText);
        $fltWordsPerSentence = $this->average_words_per_sentence($strText);
        $fltSyllablesPerWord = $this->average_syllables_per_word($strText);

        return round(206.835 - (1.015 * $fltWordsPerSentence) - (84.6 * $fltSyllablesPerWord), 1);
    }

    /**
     * Gives the Flesch-Kincaid Grade Level of the text, rounded to one digit.
     *
     * @param string $strText Text to be checked.
     * @return float
     */
    public function flesch_kincaid_grade_level($strText)
    {
        $strText = $this->clean_text($strText);
        $fltWordsPerSentence = $this->average_words_per_sentence($strText);
        $fltSyllablesPerWord = $this->average_syllables_per_word($strText);

        return round((0.39 * $fltWordsPerSentence) + (11.8 * $fltSyllablesPerWord) - 15.59, 1);
    }

    /**
     * Gives the Gunning-Fog score of the text, rounded to one digit.
     *
     * Words whose first letter is upper case are excluded from the long-word
     * percentage used here.
     *
     * @param string $strText Text to be checked.
     * @return float
     */
    public function gunning_fog_score($strText)
    {
        $strText = $this->clean_text($strText);
        $fltWordsPerSentence = $this->average_words_per_sentence($strText);
        $fltLongWordPercentage = $this->percentage_words_with_three_syllables($strText, false);

        return round(($fltWordsPerSentence + $fltLongWordPercentage) * 0.4, 1);
    }

    /**
     * Gives the Coleman-Liau Index of the text, rounded to one digit.
     *
     * Computed as 5.89 * letters-per-word - 29.5 * sentences-per-word - 15.8.
     *
     * @param string $strText Text to be checked.
     * @return float
     */
    public function coleman_liau_index($strText)
    {
        $strText = $this->clean_text($strText);
        $intWords = $this->word_count($strText);
        $fltLettersPerWord = $this->letter_count($strText) / $intWords;
        $fltSentencesPerWord = $this->sentence_count($strText) / $intWords;

        return round((5.89 * $fltLettersPerWord) - (29.5 * $fltSentencesPerWord) - 15.8, 1);
    }

    /**
     * Gives the SMOG Index of the text, rounded to one digit.
     *
     * @param string $strText Text to be checked.
     * @return float
     */
    public function smog_index($strText)
    {
        $strText = $this->clean_text($strText);
        $intLongWords = $this->words_with_three_syllables($strText);
        // sentence_count() never returns less than 1, so the division is safe.
        $intSentences = $this->sentence_count($strText);

        return round(1.043 * sqrt(($intLongWords * (30 / $intSentences)) + 3.1291), 1);
    }

    /**
     * Gives the Automated Readability Index of the text, rounded to one digit.
     *
     * @param string $strText Text to be checked.
     * @return float
     */
    public function automated_readability_index($strText)
    {
        $strText = $this->clean_text($strText);
        $intWords = $this->word_count($strText);
        $fltLettersPerWord = $this->letter_count($strText) / $intWords;
        $fltWordsPerSentence = $intWords / $this->sentence_count($strText);

        return round((4.71 * $fltLettersPerWord) + (0.5 * $fltWordsPerSentence) - 21.43, 1);
    }

    /**
     * Gives the string length in characters.
     *
     * Uses mb_strlen() when the mbstring extension is loaded and falls back
     * to the byte-wise strlen() otherwise.
     *
     * @param string $strText Text to be measured.
     * @return int
     */
    public function text_length($strText)
    {
        if (!function_exists('mb_strlen')) {
            return strlen($strText);
        }

        return $this->has_explicit_encoding()
            ? mb_strlen($strText, $this->strEncoding)
            : mb_strlen($strText);
    }

    /**
     * Gives the letter count of the text; everything except A-Z and a-z is
     * ignored.
     *
     * @param string $strText Text to be measured.
     * @return int
     */
    public function letter_count($strText)
    {
        $strText = $this->clean_text($strText); // To clear out newlines etc.
        $strLettersOnly = preg_replace('/[^A-Za-z]+/', '', $strText);

        return $this->text_length($strLettersOnly);
    }

    /**
     * Trims, removes markup, line breaks and repeated spaces, and normalises
     * sentence terminators so the counting methods can work on plain text.
     *
     * The result always ends with a single full stop, has exactly one space
     * after every full stop inside the text, and has the word following each
     * full stop lower-cased.
     *
     * @param string $strText Text to be transformed.
     * @return string
     */
    protected function clean_text($strText)
    {
        // The closing tag of each of these elements ends a sentence, so it is
        // turned into a full stop before the remaining markup is stripped.
        $arrFullStopTags = array('li', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'dd');
        foreach ($arrFullStopTags as $strTag) {
            $strText = str_ireplace('</' . $strTag . '>', '.', $strText);
        }

        $strText = strip_tags($strText);
        $strText = preg_replace('/[,:;()-]/', ' ', $strText); // Replace commas, hyphens etc. (count them as spaces)
        $strText = preg_replace('/[\.!?]/', '.', $strText); // Unify terminators
        $strText = trim($strText) . '.'; // Add final terminator, just in case it's missing
        $strText = preg_replace('/[ ]*(\n|\r\n|\r)[ ]*/', ' ', $strText); // Replace new lines with spaces
        $strText = preg_replace('/([\.])[\. ]+/', '$1', $strText); // Collapse duplicated terminators
        $strText = trim(preg_replace('/[ ]*([\.])/', '$1 ', $strText)); // Pad sentence terminators
        $strText = preg_replace('/[ ]+/', ' ', $strText); // Remove multiple spaces

        // Lower case the word following each terminator, so that sentence-initial
        // capitals are not mistaken for proper nouns by the Gunning-Fog score.
        // A closure is used because create_function() no longer exists in PHP 8.
        $strText = preg_replace_callback(
            '/\. [^ ]+/',
            function ($arrMatches) {
                return strtolower($arrMatches[0]);
            },
            $strText
        );

        return $strText;
    }

    /**
     * Converts a string to lower case.
     *
     * Uses mb_strtolower() when the mbstring extension is loaded and falls
     * back to strtolower() otherwise.
     *
     * @param string $strText Text to be transformed.
     * @return string
     */
    protected function lower_case($strText)
    {
        if (!function_exists('mb_strtolower')) {
            return strtolower($strText);
        }

        return $this->has_explicit_encoding()
            ? mb_strtolower($strText, $this->strEncoding)
            : mb_strtolower($strText);
    }

    /**
     * Converts a string to upper case.
     *
     * Uses mb_strtoupper() when the mbstring extension is loaded and falls
     * back to strtoupper() otherwise.
     *
     * @param string $strText Text to be transformed.
     * @return string
     */
    protected function upper_case($strText)
    {
        if (!function_exists('mb_strtoupper')) {
            return strtoupper($strText);
        }

        return $this->has_explicit_encoding()
            ? mb_strtoupper($strText, $this->strEncoding)
            : mb_strtoupper($strText);
    }

    /**
     * Gets a portion of a string.
     *
     * Uses mb_substr() when the mbstring extension is loaded and falls back
     * to the byte-wise substr() otherwise.
     *
     * @param string $strText   Text to be cut up.
     * @param int    $intStart  Start character.
     * @param int    $intLength Length.
     * @return string
     */
    protected function substring($strText, $intStart, $intLength)
    {
        if (!function_exists('mb_substr')) {
            return substr($strText, $intStart, $intLength);
        }

        return $this->has_explicit_encoding()
            ? mb_substr($strText, $intStart, $intLength, $this->strEncoding)
            : mb_substr($strText, $intStart, $intLength);
    }

    /**
     * Returns the sentence count for the text (never less than 1).
     *
     * Will be tripped up by "Mr." or "U.K.". Not a major concern at this point.
     *
     * @param string $strText Text to be measured.
     * @return int
     */
    public function sentence_count($strText)
    {
        $strText = $this->clean_text($strText);
        $strTerminators = preg_replace('/[^\.!?]/', '', $strText);

        return max(1, $this->text_length($strTerminators));
    }

    /**
     * Returns the word count for the text, computed as the number of spaces
     * in the cleaned text plus one.
     *
     * Will be tripped up by em dashes with spaces either side, among other
     * similar characters.
     *
     * @param string $strText Text to be measured.
     * @return int
     */
    public function word_count($strText)
    {
        $strText = $this->clean_text($strText);
        $strSpaces = preg_replace('/[^ ]/', '', $strText);

        return 1 + $this->text_length($strSpaces);
    }

    /**
     * Returns the average number of words per sentence for the text.
     *
     * @param string $strText Text to be measured.
     * @return int|float
     */
    public function average_words_per_sentence($strText)
    {
        $strText = $this->clean_text($strText);

        return $this->word_count($strText) / $this->sentence_count($strText);
    }

    /**
     * Returns the average number of syllables per word for the text.
     *
     * @param string $strText Text to be measured.
     * @return int|float
     */
    public function average_syllables_per_word($strText)
    {
        $strText = $this->clean_text($strText);
        $intSyllableCount = 0;

        // Iterate the words themselves rather than indexing up to a separately
        // computed word count, so a mismatch can never read a missing offset.
        foreach (explode(' ', $strText) as $strWord) {
            $intSyllableCount += $this->syllable_count($strWord);
        }

        return $intSyllableCount / $this->word_count($strText);
    }

    /**
     * Returns the number of words with three or more syllables.
     *
     * @param string $strText             Text to be measured.
     * @param bool   $blnCountProperNouns Whether words whose first letter is
     *                                    upper case are included in the count.
     * @return int
     */
    public function words_with_three_syllables($strText, $blnCountProperNouns = true)
    {
        $strText = $this->clean_text($strText);
        $intLongWordCount = 0;

        foreach (explode(' ', $strText) as $strWord) {
            if ($this->syllable_count($strWord) <= 2) {
                continue;
            }

            if ($blnCountProperNouns) {
                $intLongWordCount++;
                continue;
            }

            // Only count the word when its first letter is lower case.
            $strFirstLetter = $this->substring($strWord, 0, 1);
            if ($strFirstLetter !== $this->upper_case($strFirstLetter)) {
                $intLongWordCount++;
            }
        }

        return $intLongWordCount;
    }

    /**
     * Returns the percentage of words with three or more syllables.
     *
     * @param string $strText             Text to be measured.
     * @param bool   $blnCountProperNouns Whether words whose first letter is
     *                                    upper case are included in the count.
     * @return int|float
     */
    public function percentage_words_with_three_syllables($strText, $blnCountProperNouns = true)
    {
        $strText = $this->clean_text($strText);
        $intWordCount = $this->word_count($strText);
        $intLongWordCount = $this->words_with_three_syllables($strText, $blnCountProperNouns);

        return ($intLongWordCount / $intWordCount) * 100;
    }

    /**
     * Returns the number of syllables in the word (never less than 1).
     *
     * Based in part on Greg Fast's Perl module Lingua::EN::Syllables.
     *
     * @param string $strWord Word to be measured; surrounding punctuation is ignored.
     * @return int
     */
    public function syllable_count($strWord)
    {
        $strWord = $this->lower_case($strWord);

        // Strip non-letters FIRST. Words taken from cleaned text still carry
        // their sentence terminator ("forever."), which would otherwise defeat
        // both the exception lookup and the "$"-anchored suffix patterns below.
        $strWord = preg_replace('/[^a-z]/is', '', $strWord);

        // Specific common exceptions that don't follow the rule set below
        // (word as key, syllable count as value).
        $arrProblemWords = array(
            'simile' => 3,
            'forever' => 3,
            'shoreline' => 2,
        );
        if (isset($arrProblemWords[$strWord])) {
            return $arrProblemWords[$strWord];
        }

        // These syllables would be counted as two but should be one
        $arrSubSyllables = array(
            'cial',
            'tia',
            'cius',
            'cious',
            'giu',
            'ion',
            'iou',
            'sia$',
            '[^aeiuoyt]{2,}ed$',
            '.ely$',
            '[cg]h?e[rsd]?$',
            'rved?$',
            '[aeiouy][dt]es?$',
            '[aeiouy][^aeiouydt]e[rsd]?$',
            '^[dr]e[aeiou][^aeiou]+$', // Sorts out deal, deign etc
            '[aeiouy]rse$', // Purse, hearse
        );

        // These syllables would be counted as one but should be two
        $arrAddSyllables = array(
            'ia',
            'riet',
            'dien',
            'iu',
            'io',
            'ii',
            '[aeiouym]bl$',
            '[aeiou]{3}',
            '^mc',
            'ism$',
            '([^aeiouy])\1l$',
            '[^l]lien',
            '^coa[dglx].',
            '[^gq]ua[^auieo]',
            'dnt$',
            'uity$',
            'ie(r|st)$',
        );

        // Single syllable prefixes and suffixes
        $arrPrefixSuffix = array(
            '/^un/',
            '/^fore/',
            '/ly$/',
            '/less$/',
            '/ful$/',
            '/ers?$/',
            '/ings?$/',
        );

        // Remove prefixes and suffixes and count how many were taken; each
        // one removed contributes one syllable.
        $intPrefixSuffixCount = 0;
        $strWord = preg_replace($arrPrefixSuffix, '', $strWord, -1, $intPrefixSuffixCount);

        // Every run of consecutive vowels in what is left counts as one syllable.
        $arrVowelGroups = preg_split('/[^aeiouy]+/', $strWord, -1, PREG_SPLIT_NO_EMPTY);
        $intSyllableCount = count($arrVowelGroups) + $intPrefixSuffixCount;

        // Some syllables do not follow normal rules - adjust for them.
        foreach ($arrSubSyllables as $strSyllable) {
            $intSyllableCount -= preg_match('~' . $strSyllable . '~', $strWord);
        }
        foreach ($arrAddSyllables as $strSyllable) {
            $intSyllableCount += preg_match('~' . $strSyllable . '~', $strWord);
        }

        // Every word has at least one syllable, even if the adjustments above
        // drove the tally to zero or below.
        return max(1, $intSyllableCount);
    }

    /**
     * Tells whether a character encoding was configured for this object.
     *
     * @return bool
     */
    private function has_explicit_encoding()
    {
        return (string) $this->strEncoding !== '';
    }
}
?>