"""
:mod:`dim` is an HTML parser and simple DOM implementation with CSS
selector support.
:mod:`dim`
- is a single module;
- has no dependency outside `PSL <https://docs.python.org/3/library/>`_;
- is not crazy long;
- supports Python 3.6 and forward,
so the file could be directly embedded in any Python 3.6+ application,
or even in a monolithic source file. :mod:`dim` was designed to ease the
development of `googler(1) <https://github.com/jarun/googler/>`_, which
itself promises to be a single Python script with zero third-party dep.
Simple example:
.. doctest::
>>> import dim
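>>> html = '''
... <html>
... <body>
...   <table id="primary">
...     <thead>
...       <tr><th class="bold">A</th><th>B</th></tr>
...     </thead>
...     <tbody>
...       <tr class="highlight"><td class="bold">1</td><td>2</td></tr>
...       <tr><td class="bold">3</td><td>4</td></tr>
...       <tr><td class="bold">5</td><td>6</td></tr>
...       <tr><td class="bold">7</td><td>8</td></tr>
...     </tbody>
...   </table>
...   <table id="secondary">
...     <thead>
...       <tr><th class="bold">C</th><th>D</th></tr>
...     </thead>
...     <tbody></tbody>
...   </table>
... </body>
... </html>'''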
>>> root = dim.parse_html(html)
>>> [elem.text for elem in root.select_all('table#primary th.bold, '
... 'table#primary tr.highlight + tr > td.bold')]
['A', '3']
>>> [elem.text for elem in root.select_all('table#primary th.bold, '
... 'table#primary tr.highlight ~ tr > td.bold')]
['A', '3', '5', '7']
>>> [elem.text for elem in root.select_all('th.bold, tr.highlight ~ tr > td.bold')]
['A', '3', '5', '7', 'C']
"""
import html
import re
from collections import OrderedDict
from enum import Enum
from html.parser import HTMLParser
from typing import (
Any,
Dict,
Generator,
Iterable,
Iterator,
List,
Match,
Optional,
Sequence,
Tuple,
Union,
cast,
)
# Anything accepted where a selector is expected: a selector string, a
# SelectorGroup, or a single Selector. Node._normalize_selector() turns
# any of these into a SelectorGroup.
SelectorGroupLike = Union[str, "SelectorGroup", "Selector"]
class Node:
    """
    Represents a DOM node.

    Parts of JavaScript's DOM ``Node`` API and ``Element`` API are
    mirrored here, with extensions. In particular, ``querySelector`` and
    ``querySelectorAll`` are mirrored.

    Notable properties and methods: :meth:`attr()`, :attr:`classes`,
    :attr:`html`, :attr:`text`, :meth:`ancestors()`,
    :meth:`descendants()`, :meth:`select()`, :meth:`select_all()`,
    :meth:`matched_by()`.

    Attributes:
        tag (:class:`Optional`\\[:class:`str`])
        attrs (:class:`Dict`\\[:class:`str`, :class:`str`])
        parent (:class:`Optional`\\[:class:`Node`])
        children (:class:`List`\\[:class:`Node`])
    """

    # Meant to be reimplemented by subclasses.
    def __init__(self) -> None:
        self.tag = None  # type: Optional[str]
        self.attrs = {}  # type: Dict[str, str]
        self.parent = None  # type: Optional[Node]
        self.children = []  # type: List[Node]

        # Used in DOMBuilder.
        self._partial = False
        self._namespace = None  # type: Optional[str]

    # HTML representation of the node. Meant to be implemented by
    # subclasses.
    def __str__(self) -> str:  # pragma: no cover
        raise NotImplementedError

    def select(self, selector: SelectorGroupLike) -> Optional["Node"]:
        """DOM ``querySelector`` clone. Returns one match (if any)."""
        selector = self._normalize_selector(selector)
        # The generator is lazy, so the search stops at the first match.
        return next(self._select_all(selector), None)

    def query_selector(self, selector: SelectorGroupLike) -> Optional["Node"]:
        """Alias of :meth:`select`."""
        return self.select(selector)

    def select_all(self, selector: SelectorGroupLike) -> List["Node"]:
        """DOM ``querySelectorAll`` clone. Returns all matches in a list."""
        selector = self._normalize_selector(selector)
        return list(self._select_all(selector))

    def query_selector_all(self, selector: SelectorGroupLike) -> List["Node"]:
        """Alias of :meth:`select_all`."""
        return self.select_all(selector)

    def matched_by(
        self, selector: SelectorGroupLike, root: Optional["Node"] = None
    ) -> bool:
        """
        Checks whether this node is matched by `selector`.

        See :meth:`SelectorGroup.matches()`.
        """
        selector = self._normalize_selector(selector)
        return selector.matches(self, root=root)

    @staticmethod
    def _normalize_selector(selector: SelectorGroupLike) -> "SelectorGroup":
        """
        Coerce a selector string, :class:`Selector` or
        :class:`SelectorGroup` into a :class:`SelectorGroup`.

        Raises :class:`ValueError` for any other kind of object.
        """
        if isinstance(selector, str):
            return SelectorGroup.from_str(selector)
        if isinstance(selector, SelectorGroup):
            return selector
        if isinstance(selector, Selector):
            return SelectorGroup([selector])
        raise ValueError("not a selector or group of selectors: %s" % repr(selector))

    def _select_all(self, selector: "SelectorGroup") -> Generator["Node", None, None]:
        """Lazily yield every descendant matched by `selector`, in DOM order."""
        for descendant in self.descendants():
            if selector.matches(descendant, root=self):
                yield descendant

    def child_nodes(self) -> List["Node"]:
        """Returns the list of children (the list itself, not a copy)."""
        return self.children

    def first_child(self) -> Optional["Node"]:
        """Returns the first child, or ``None`` if there are no children."""
        return self.children[0] if self.children else None

    def first_element_child(self) -> Optional["Node"]:
        """Returns the first child that is an :class:`ElementNode`, if any."""
        for child in self.children:
            if isinstance(child, ElementNode):
                return child
        return None

    def last_child(self) -> Optional["Node"]:
        """Returns the last child, or ``None`` if there are no children."""
        return self.children[-1] if self.children else None

    def last_element_child(self) -> Optional["Node"]:
        """Returns the last child that is an :class:`ElementNode`, if any."""
        for child in reversed(self.children):
            if isinstance(child, ElementNode):
                return child
        return None

    def next_sibling(self) -> Optional["Node"]:
        """.. note:: Not O(1), use with caution."""
        next_siblings = self.next_siblings()
        return next_siblings[0] if next_siblings else None

    def next_siblings(self) -> List["Node"]:
        """
        Returns the siblings following this node, in DOM order. A node
        without a parent has no siblings.
        """
        parent = self.parent
        # Identity check rather than truthiness: a node may be falsy
        # (e.g. an empty TextNode is an empty str) yet still be a parent.
        if parent is None:
            return []
        try:
            index = parent.children.index(self)
        except ValueError:  # pragma: no cover
            raise ValueError("node is not found in children of its parent")
        return parent.children[index + 1 :]

    def next_element_sibling(self) -> Optional["ElementNode"]:
        """.. note:: Not O(1), use with caution."""
        for sibling in self.next_siblings():
            if isinstance(sibling, ElementNode):
                return sibling
        return None

    def previous_sibling(self) -> Optional["Node"]:
        """.. note:: Not O(1), use with caution."""
        previous_siblings = self.previous_siblings()
        return previous_siblings[0] if previous_siblings else None

    def previous_siblings(self) -> List["Node"]:
        """
        Compared to the natural DOM order, the order of returned nodes
        is reversed. That is, the adjacent sibling (if any) is the
        first in the returned list.
        """
        parent = self.parent
        # See next_siblings() for why this is an identity check.
        if parent is None:
            return []
        try:
            index = parent.children.index(self)
        except ValueError:  # pragma: no cover
            raise ValueError("node is not found in children of its parent")
        # Everything before this node, nearest sibling first; this is
        # naturally empty when the node is the first child.
        return parent.children[:index][::-1]

    def previous_element_sibling(self) -> Optional["ElementNode"]:
        """.. note:: Not O(1), use with caution."""
        for sibling in self.previous_siblings():
            if isinstance(sibling, ElementNode):
                return sibling
        return None

    def ancestors(
        self, *, root: Optional["Node"] = None
    ) -> Generator["Node", None, None]:
        """
        Ancestors are generated in reverse order of depth, stopping at
        `root` (which is itself generated last when given).

        A :class:`RuntimeError` is raised if `root` is not in the
        ancestral chain.
        """
        if self is root:
            return
        ancestor = self.parent
        while ancestor is not root:
            if ancestor is None:
                raise RuntimeError("provided root node not found in ancestral chain")
            yield ancestor
            ancestor = ancestor.parent
        # Identity check so that a falsy root node is still generated.
        if root is not None:
            yield root

    def descendants(self) -> Generator["Node", None, None]:
        """Descendants are generated in depth-first order."""
        for child in self.children:
            yield child
            yield from child.descendants()

    def attr(self, attr: str) -> Optional[str]:
        """Returns the attribute if it exists on the node, otherwise ``None``."""
        return self.attrs.get(attr)

    @property
    def html(self) -> str:
        """
        HTML representation of the node.

        (For a :class:`TextNode`, :attr:`html` returns the escaped version
        of the text.)
        """
        return str(self)

    def outer_html(self) -> str:
        """Alias of :attr:`html`."""
        return self.html

    def inner_html(self) -> str:
        """HTML representation of the node's children."""
        return "".join(child.html for child in self.children)

    @property
    def text(self) -> str:  # pragma: no cover
        """This property is expected to be implemented by subclasses."""
        raise NotImplementedError

    def text_content(self) -> str:
        """Alias of :attr:`text`."""
        return self.text

    @property
    def classes(self) -> List[str]:
        """Whitespace-separated tokens of the ``class`` attribute, in order."""
        return self.attrs.get("class", "").split()

    def class_list(self) -> List[str]:
        """Alias of :attr:`classes`."""
        return self.classes
class ElementNode(Node):
    """
    Represents an element node.

    Note that tag and attribute names are case-insensitive; attribute
    values are case-sensitive.
    """

    def __init__(
        self,
        tag: str,
        attrs: Iterable[Tuple[str, Optional[str]]],
        *,
        parent: Optional["Node"] = None,
        children: Optional[Sequence["Node"]] = None
    ) -> None:
        """
        Create an element.

        `tag` and the attribute names are lower-cased; an attribute given
        without a value (``None``) is stored as the empty string. The
        `children` sequence is copied into a new list.
        """
        Node.__init__(self)
        self.tag = tag.lower()  # type: str
        self.attrs = OrderedDict((attr.lower(), val or "") for attr, val in attrs)
        self.parent = parent
        self.children = list(children or [])

    def __repr__(self) -> str:
        """Debug representation listing the tag, attributes and children."""
        s = "<" + self.tag
        if self.attrs:
            s += " attrs=%s" % repr(list(self.attrs.items()))
        if self.children:
            s += " children=%s" % repr(self.children)
        s += ">"
        return s

    # https://ipython.readthedocs.io/en/stable/api/generated/IPython.lib.pretty.html
    def _repr_pretty_(self, p: Any, cycle: bool) -> None:  # pragma: no cover
        if cycle:
            raise RuntimeError("cycle detected in DOM tree")
        p.text("<\x1b[1m%s\x1b[0m" % self.tag)
        if self.attrs:
            p.text(" attrs=%s" % repr(list(self.attrs.items())))
        if self.children:
            p.text(" children=[")
            if len(self.children) == 1 and isinstance(self.first_child(), TextNode):
                # A lone text child is printed inline.
                p.text("\x1b[4m%s\x1b[0m" % repr(self.first_child()))
            else:
                with p.indent(2):
                    for child in self.children:
                        p.break_()
                        if hasattr(child, "_repr_pretty_"):
                            child._repr_pretty_(p, False)  # type: ignore
                        else:
                            p.text("\x1b[4m%s\x1b[0m" % repr(child))
                        p.text(",")
                p.break_()
            p.text("]")
        p.text(">")

    def __str__(self) -> str:
        """
        HTML representation of the node.

        Attribute values are HTML-escaped. An element with children is
        serialized as ``<tag ...>children</tag>``; a childless element is
        serialized as ``<tag .../>`` when `_tag_is_void` says the tag is
        void, and as ``<tag ...></tag>`` otherwise.
        """
        parts = ["<" + self.tag]
        for attr, val in self.attrs.items():
            parts.append(' %s="%s"' % (attr, html.escape(val)))
        if self.children:
            parts.append(">")
            parts.extend(str(child) for child in self.children)
            # An end tag is "</" + name + ">"; without the "</" the output
            # would read e.g. "<p>textp>".
            parts.append("</%s>" % self.tag)
        elif _tag_is_void(self.tag):
            parts.append("/>")
        else:
            parts.append("></%s>" % self.tag)
        return "".join(parts)

    @property
    def text(self) -> str:
        """The concatenation of all descendant text nodes."""
        return "".join(child.text for child in self.children)
class TextNode(str, Node):
    """
    Represents a text node.

    Subclasses :class:`Node` and :class:`str`.

    Text nodes compare (and hash) by identity, not by content; use
    :attr:`text` to compare the underlying strings.
    """

    def __new__(cls, text: str) -> "TextNode":
        s = str.__new__(cls, text)  # type: ignore
        s.parent = None
        return s  # type: ignore

    def __init__(self, text: str) -> None:
        # The string content was already fixed by __new__; only the Node
        # bookkeeping attributes need to be set up here.
        Node.__init__(self)

    def __repr__(self) -> str:
        return "<%s>" % str.__repr__(self)

    # HTML-escaped form of the text node. Use the text property for the
    # unescaped version.
    def __str__(self) -> str:
        return html.escape(self)

    def __eq__(self, other: object) -> bool:
        """
        Two text nodes are equal if and only if they are the same node.
        For string comparison, use :attr:`text`.
        """
        return self is other

    def __ne__(self, other: object) -> bool:
        """
        Two text nodes are non-equal if they are not the same node.
        For string comparison, use :attr:`text`.
        """
        return self is not other

    def __hash__(self) -> int:
        """
        Identity-based hash, consistent with the identity-based
        :meth:`__eq__`.

        Defining ``__eq__`` alone would implicitly set ``__hash__`` to
        ``None`` and make text nodes unusable in sets and as dict keys.
        """
        return object.__hash__(self)

    @property
    def text(self) -> str:
        """The unescaped text, as a plain :class:`str`."""
        return str.__str__(self)
class DOMBuilderException(Exception):
    """
    Raised by :class:`DOMBuilder` when it runs into a bad state.

    Attributes:
        pos (:class:`Tuple`\\[:class:`int`, :class:`int`]):
            Line number and offset in HTML input.
        why (:class:`str`):
            Reason of the exception.
    """

    def __init__(self, pos: Tuple[int, int], why: str) -> None:
        """Record where (`pos`) and why (`why`) the builder gave up."""
        self.pos, self.why = pos, why

    def __str__(self) -> str:  # pragma: no cover
        """Human-readable message of the form ``... at LINE:OFFSET: WHY``."""
        line = self.pos[0]
        offset = self.pos[1]
        return "DOM builder aborted at %d:%d: %s" % (line, offset, self.why)
class DOMBuilder(HTMLParser):
"""
HTML parser / DOM builder.
Subclasses :class:`html.parser.HTMLParser`.
Consume HTML and builds a :class:`Node` tree. Once finished, use
:attr:`root` to access the root of the tree.
This parser cannot parse malformed HTML with tag mismatch.
"""
def __init__(self) -> None:
super().__init__(convert_charrefs=True)
# _stack is the stack for nodes. Each node is pushed to the
# stack when its start tag is processed, and remains on the
# stack until its parent node is completed (end tag processed),
# at which point the node is attached to the parent node as a
# child and popped from the stack.
self._stack = [] # type: List[Node]
# _namespace_stack is another stack tracking the parsing
# context, which is generally the default namespace (None) but
# changes when parsing foreign objects (e.g. 'svg' when parsing
# an