Barnaby Walters');
 *     echo json_encode($output, JSON_PRETTY_PRINT);
 *
 * Produces:
 *
 *     {
 *         "items": [
 *             {
 *                 "type": ["h-card"],
 *                 "properties": {
 *                     "name": ["Barnaby Walters"]
 *                 }
 *             }
 *         ],
 *         "rels": {}
 *     }
 *
 * @param string|DOMDocument $input The HTML string or DOMDocument object to parse
 * @param string $url The URL the input document was found at, for relative URL resolution
 * @param bool $convertClassic whether or not to convert classic microformats
 * @return array Canonical MF2 array structure
 */
function parse($input, $url = null, $convertClassic = true) {
    // Build a one-off Parser for this input and hand back its result directly.
    return (new Parser($input, $url))->parse($convertClassic);
}

/**
 * Fetch microformats2
 *
 * Given a URL, fetches it (following up to 5 redirects) and, if the content-type appears to be HTML, returns the parsed
 * microformats2 array structure.
 *
 * Note that even if the response code was a 4XX or 5XX error, if the content-type is HTML-like then it will be parsed
 * all the same, as there are legitimate cases where error pages might contain useful microformats (for example a deleted
 * h-entry resulting in a 410 Gone page with a stub h-entry explaining the reason for deletion). Look in $curlInfo['http_code']
 * for the actual value.
*
 * @param string $url The URL to fetch
 * @param bool $convertClassic (optional, default true) whether or not to convert classic microformats
 * @param &array $curlInfo (optional) the results of curl_getinfo will be placed in this variable for debugging
 * @return array|null canonical microformats2 array structure on success, null on failure
 */
function fetch($url, $convertClassic = true, &$curlInfo = null) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Accept: text/html'
    ));
    $html = curl_exec($ch);
    $info = $curlInfo = curl_getinfo($ch);
    curl_close($ch);

    // A failed transfer leaves the body as false and the content type unset/null;
    // normalise the latter so it is never passed to a string function as null.
    $contentType = isset($info['content_type']) ? (string) $info['content_type'] : '';

    if ($html === false || stripos($contentType, 'html') === false) {
        // The content was not delivered as HTML (or not delivered at all), do not attempt to parse it.
        return null;
    }

    // Use the final URL (after redirects) to resolve relative URLs.
    return parse($html, $info['url'], $convertClassic);
}

/**
 * Unicode to HTML Entities
 *
 * NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated as of PHP 8.2;
 * behaviour is intentionally left unchanged here.
 *
 * @param string $input String containing characters to convert into HTML entities
 * @return string
 */
function unicodeToHtmlEntities($input) {
    return mb_convert_encoding($input, 'HTML-ENTITIES', mb_detect_encoding($input));
}

/**
 * Collapse Whitespace
 *
 * Collapses any sequences of whitespace within a string into a single space
 * character. Non-whitespace characters (including "|") are left untouched.
 *
 * @deprecated since v0.2.3
 * @param string $str
 * @return string
 */
function collapseWhitespace($str) {
    // \s already covers newlines; a "|" inside a character class is a literal pipe, so it must not appear here.
    return preg_replace('/\s+/', ' ', $str);
}

/**
 * Trim leading and trailing whitespace, treating non-breaking spaces as whitespace.
 *
 * Every UTF-8 non-breaking space (U+00A0) in the string is first replaced by a
 * regular space, then leading and trailing whitespace runs are removed.
 *
 * @param string $str
 * @return string
 */
function unicodeTrim($str) {
    // this is cheating. TODO: find a better way if this causes any problems
    $str = str_replace("\xC2\xA0", ' ', $str);
    $str = preg_replace('/^\s+/', '', $str);
    return preg_replace('/\s+$/', '', $str);
}

/**
 * Microformat Name From Class string
 *
 * Given the value of @class, get the relevant mf classnames (e.g. h-card,
 * p-name).
*
 * @param string $class A whitespace delimited list of classnames
 * @param string $prefix The prefix to look for
 * @return array All matching microformats classnames, in source order. For the
 *               'h-' prefix the full classname is returned; for any other
 *               prefix the prefix is stripped. Empty array if nothing matches.
 */
function mfNamesFromClass($class, $prefix = 'h-') {
    // Class tokens may be separated by any ASCII whitespace (space, tab, LF, FF, CR).
    $classes = preg_split('/[ \t\n\f\r]+/', (string) $class, -1, PREG_SPLIT_NO_EMPTY);

    // Keep only tokens which are syntactically valid mf2 classnames.
    $classes = preg_grep('#^(h|p|u|dt|e)-([a-z0-9]+-)?[a-z]+(-[a-z]+)*$#', $classes);

    $prefixLength = strlen($prefix);
    $matches = array();

    foreach ($classes as $classname) {
        // The classname must start with the prefix and have something after it.
        if (strncmp($classname, $prefix, $prefixLength) !== 0 || $classname === $prefix) {
            continue;
        }

        $matches[] = ($prefix === 'h-') ? $classname : substr($classname, $prefixLength);
    }

    return $matches;
}

/**
 * Registered with the XPath object and used within XPaths for finding root elements.
 *
 * @param string $class The value of a class attribute
 * @return bool True if at least one h-* root classname is present
 */
function classHasMf2RootClassname($class) {
    return count(mfNamesFromClass($class, 'h-')) > 0;
}

/**
 * Get Nested µf Property Name From Class
 *
 * Returns all the p-, u-, dt- or e- prefixed classnames it finds in a
 * space-separated string.
*
 * @param string $class
 * @return array Map of property name => list of the prefixes it was found with
 */
function nestedMfPropertyNamesFromClass($class) {
    $propertyNames = array();

    foreach (array('p-', 'u-', 'dt-', 'e-') as $prefix) {
        foreach (mfNamesFromClass($class, $prefix) as $property) {
            $propertyNames[$property][] = $prefix;
        }
    }

    // De-duplicate the prefixes recorded for each property (keys are preserved).
    return array_map('array_unique', $propertyNames);
}

/**
 * Wraps mfNamesFromClass to handle an element as input (common)
 *
 * @param DOMElement $e The element to get the classname for
 * @param string $prefix The prefix to look for
 * @return mixed See return value of mfNamesFromClass()
 */
function mfNamesFromElement(\DOMElement $e, $prefix = 'h-') {
    return mfNamesFromClass($e->getAttribute('class'), $prefix);
}

/**
 * Wraps nestedMfPropertyNamesFromClass to handle an element as input
 *
 * @param DOMElement $e The element whose class attribute is inspected
 * @return array See return value of nestedMfPropertyNamesFromClass()
 */
function nestedMfPropertyNamesFromElement(\DOMElement $e) {
    return nestedMfPropertyNamesFromClass($e->getAttribute('class'));
}

/**
 * Converts various 12-hour time formats to 24-hour HH:MM (or HH:MM:SS when seconds are given).
 *
 * If no am/pm marker is found, the input is returned unchanged.
 *
 * @param string $time The time to convert
 * @return string
 */
function convertTimeFormat($time) {
    preg_match('/(\d{1,2}):?(\d{2})?:?(\d{2})?(a\.?m\.?|p\.?m\.?)?/i', $time, $matches);

    // If no am/pm is specified there is nothing to convert.
    if (empty($matches[4])) {
        return $time;
    }

    $meridiem = strtolower(str_replace('.', '', $matches[4]));
    $hours = (int) $matches[1];

    if ($meridiem === 'pm' && $hours < 12) {
        // Afternoon/evening: shift onto the 24-hour clock.
        $hours += 12;
    } elseif ($meridiem === 'am' && $hours === 12) {
        // 12am is midnight, i.e. hour 00 on the 24-hour clock.
        $hours = 0;
    }

    $hh = str_pad((string) $hours, 2, '0', STR_PAD_LEFT);

    // Minutes default to 00 when omitted (e.g. "5pm").
    $mm = empty($matches[2]) ? '00' : $matches[2];

    // Seconds are only emitted if they were supplied.
    if (!empty($matches[3])) {
        return sprintf('%s:%s:%s', $hh, $mm, $matches[3]);
    }

    return sprintf('%s:%s', $hh, $mm);
}

/**
 * Normalize an ordinal date to YYYY-MM-DD
 *
 * This function should only be called after validating the $dtValue
 * matches regex \d{4}-\d{3}. Returns an empty string if the value cannot be
 * parsed or the day does not fall within the given year.
 *
 * @param string $dtValue Ordinal date (year and 1-based day of year)
 * @return string
 */
function normalizeOrdinalDate($dtValue) {
    $parts = explode('-', $dtValue, 2);
    if (count($parts) !== 2) {
        return '';
    }

    list($year, $day) = $parts;
    $day = intval($day);

    if ($day < 367 && $day > 0) {
        $date = \DateTime::createFromFormat('Y-z', $dtValue);
        if ($date === false) {
            // Unparseable input: fail soft instead of calling a method on false.
            return '';
        }

        // 'z' format is zero-based so need to adjust
        $date->modify('-1 day');

        // Reject days which overflow into the following year (e.g. day 366 of a non-leap year).
        if ($date->format('Y') === $year) {
            return $date->format('Y-m-d');
        }
    }

    return '';
}

/**
 * If a date value has a timezone offset, normalize it.
 *
 * A numeric offset such as "+5:30" or "-08" is rewritten in place to the
 * four-digit form ("+0530", "-0800"). A "Z" designator (either case) is left
 * untouched and yields null.
 *
 * @param string $dtValue Passed by reference; updated with the normalized offset
 * @return string|null isolated, normalized TZ offset for implied TZ for other dt- properties
 */
function normalizeTimezoneOffset(&$dtValue) {
    preg_match('/Z|[+-]\d{1,2}:?(\d{2})?$/i', $dtValue, $matches);

    // No timezone information, or a Z designator: there is no numeric offset to normalize.
    // The pattern is case-insensitive, so the Z comparison must be too.
    if (empty($matches) || strcasecmp($matches[0], 'Z') === 0) {
        return null;
    }

    $timezoneString = str_replace(':', '', $matches[0]);
    $sign = substr($timezoneString, 0, 1);
    $digits = substr($timezoneString, 1);

    // Hours only: append zero minutes.
    if (strlen($digits) <= 2) {
        $digits .= '00';
    }

    $timezoneOffset = $sign . str_pad($digits, 4, '0', STR_PAD_LEFT);
    $dtValue = preg_replace('/Z?[+-]\d{1,2}:?(\d{2})?$/i', $timezoneOffset, $dtValue);

    return $timezoneOffset;
}

/**
 * Apply a URL transformation to every candidate URL in a srcset attribute value.
 *
 * The value is split on commas; within each candidate the URL is separated
 * from its optional descriptor (e.g. "2x", "480w") at the first run of
 * whitespace, and only the URL is passed to $transformation.
 *
 * @param string $srcset The srcset attribute value
 * @param callable $transformation Called with each candidate URL, returns the replacement URL
 * @return string The rebuilt srcset value, candidates joined by ", "
 */
function applySrcsetUrlTransformation($srcset, $transformation) {
    return implode(', ', array_filter(array_map(function ($srcsetPart) use ($transformation) {
        // Split on a whitespace run; explode() would treat a character list as one literal delimiter.
        $parts = preg_split('/\s+/', trim($srcsetPart), 2);

        if (empty($parts[0])) {
            return false;
        }

        $parts[0] = call_user_func($transformation, $parts[0]);

        return $parts[0] . (empty($parts[1]) ? '' : ' ' . $parts[1]);
    }, explode(',', trim($srcset)))));
}

/**
 * Microformats2 Parser
 *
 * A class which holds state for parsing microformats2 from HTML.
 *
 * Example usage:
 *
 *     use Mf2;
 *     $parser = new Mf2\Parser('

<div class="h-card"><p class="p-name">Barnaby Walters</p></div>

'); * $output = $parser->parse(); */ class Parser { /** @var string The baseurl (if any) to use for this parse */ public $baseurl; /** @var DOMXPath object which can be used to query over any fragment*/ public $xpath; /** @var DOMDocument */ public $doc; /** @var SplObjectStorage */ protected $parsed; /** * @var bool */ public $jsonMode; /** @var boolean Whether to include experimental language parsing in the result */ public $lang = false; /** @var bool Whether to include alternates object (dropped from spec in favor of rel-urls) */ public $enableAlternates = false; /** * Elements upgraded to mf2 during backcompat * @var SplObjectStorage */ protected $upgraded; /** * Whether to convert classic microformats * @var bool */ public $convertClassic; /** * Constructor * * @param DOMDocument|string $input The data to parse. A string of HTML or a DOMDocument * @param string $url The URL of the parsed document, for relative URL resolution * @param boolean $jsonMode Whether or not to use a stdClass instance for an empty `rels` dictionary. This breaks PHP looping over rels, but allows the output to be correctly serialized as JSON. */ public function __construct($input, $url = null, $jsonMode = false) { $emptyDocDefault = ''; libxml_use_internal_errors(true); if (is_string($input)) { if (empty($input)) { $input = $emptyDocDefault; } if (class_exists('Masterminds\\HTML5')) { $doc = new \Masterminds\HTML5(array('disable_html_ns' => true)); $doc = $doc->loadHTML($input); } else { $doc = new DOMDocument(); @$doc->loadHTML(unicodeToHtmlEntities($input), \LIBXML_NOWARNING); } } elseif (is_a($input, 'DOMDocument')) { $doc = clone $input; } else { $doc = new DOMDocument(); @$doc->loadHTML($emptyDocDefault); } // Create an XPath object and allow some PHP functions to be used within XPath queries. 
$this->xpath = new DOMXPath($doc); $this->xpath->registerNamespace('php', 'http://php.net/xpath'); $this->xpath->registerPhpFunctions('\\Mf2\\classHasMf2RootClassname'); $baseurl = $url; foreach ($this->xpath->query('//base[@href]') as $base) { $baseElementUrl = $base->getAttribute('href'); if (parse_url($baseElementUrl, PHP_URL_SCHEME) === null) { /* The base element URL is relative to the document URL. * * :/ * * Perhaps the author was high? */ $baseurl = resolveUrl($url, $baseElementUrl); } else { $baseurl = $baseElementUrl; } break; } // Ignore