{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.0 64-bit ('3.8.0-amd64': pyenv)",
"metadata": {
"interpreter": {
"hash": "2334294e789e06453e89f7d5c3c2573679701f9bb5b76ad2158f91cca5c9244c"
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"source": [
" Τμήμα Πληροφορικής και Τηλεπικοινωνιών - Άρτα \n",
" Πανεπιστήμιο Ιωαννίνων \n",
"\n",
" Γκόγκος Χρήστος \n",
" http://chgogos.github.io/\n",
" Εαρινό εξάμηνο 2020-2021"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"# XML \n",
"\n",
"minidom και ElementTree (standard library)\n",
"\n",
"xmltodict (3rd party library)"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"## Parsing με το minidom"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"['Attr', 'AttributeList', 'CDATASection', 'CharacterData', 'Childless', 'Comment', 'DOMImplementation', 'DOMImplementationLS', 'Document', 'DocumentFragment', 'DocumentLS', 'DocumentType', 'EMPTY_NAMESPACE', 'EMPTY_PREFIX', 'Element', 'ElementInfo', 'EmptyNodeList', 'Entity', 'Identified', 'NamedNodeMap', 'Node', 'NodeList', 'Notation', 'ProcessingInstruction', 'ReadOnlySequentialNamedNodeMap', 'StringTypes', 'Text', 'TypeInfo', 'XMLNS_NAMESPACE', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '_append_child', '_clear_id_cache', '_clone_node', '_do_pulldom_parse', '_get_containing_element', '_get_containing_entref', '_get_elements_by_tagName_helper', '_get_elements_by_tagName_ns_helper', '_in_document', '_no_type', '_nodeTypes_with_children', '_nssplit', '_set_attribute_node', '_write_data', 'defproperty', 'domreg', 'getDOMImplementation', 'io', 'parse', 'parseString', 'xml']\n"
]
}
],
"source": [
"import xml.dom.minidom\n",
"print(dir(xml.dom.minidom))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n\n\t\n \n\t\n\t\t\n \n\t\tΆρτα\n\t\t\n \n\t\t45221\n\t\t\n \n\t\tΉπειρος\n\t\t\n \n\t\tΑνεξαρτησίας 33\n\t\t\n \n\t\n\t\n \n\t30\n\t\n \n\tΓιάννης\n\t\n \n\tΠαπαδόπουλος\n\t\n \n\ttrue\n\t\n \n\t\n\t\t\n \n\t\t\n\t\t\t\n \n\t\t\t00302681123456\n\t\t\t\n \n\t\t\thome\n\t\t\t\n \n\t\t\n\t\t\n \n\t\t\n\t\t\t\n \n\t\t\t00302681654321\n\t\t\t\n \n\t\t\twork\n\t\t\t\n \n\t\t\n\t\t\n \n\t\t\n\t\t\t\n \n\t\t\t00306971234567\n\t\t\t\n \n\t\t\tmobile\n\t\t\t\n \n\t\t\n\t\t\n \n\t\n\t\n \n\t\n\t\n\n\n\n"
]
}
],
"source": [
"# parse the XML file and print it pretty-formatted\n",
"\n",
"path = \"../../../datasets/person.xml\"\n",
"# the with-block closes the file handle as soon as the content has been parsed\n",
"with open(path, \"r\", encoding=\"utf-8\") as xml_f:\n",
"    xmlparse = xml.dom.minidom.parseString(xml_f.read())\n",
"prettyxml = xmlparse.toprettyxml()\n",
"print(prettyxml)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'<lastName>Παπαδόπουλος</lastName>'"
]
},
"metadata": {},
"execution_count": 3
}
],
"source": [
"# show the lastName element serialized as XML markup\n",
"\n",
"last_name_element = xmlparse.getElementsByTagName('lastName')[0]\n",
"last_name_element.toxml()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Παπαδόπουλος\nΠαπαδόπουλος\n"
]
}
],
"source": [
"# show the text content of the lastName element\n",
"\n",
"last_name_text_node = xmlparse.getElementsByTagName('lastName')[0].firstChild\n",
"print(last_name_text_node.data)\n",
"print(last_name_text_node.nodeValue)  # nodeValue is an alias for data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"city: Άρτα\npostalCode: 45221\nstate: Ήπειρος\nstreetAddress: Ανεξαρτησίας 33\n"
]
}
],
"source": [
"# show every piece of information nested inside the address tag\n",
"\n",
"address = xmlparse.getElementsByTagName('address')[0]\n",
"# keep only element nodes; childNodes may also hold other node types (e.g. text)\n",
"address_fields = [child for child in address.childNodes if child.nodeType == child.ELEMENT_NODE]\n",
"for field in address_fields:\n",
"    print(f'{field.tagName}: {field.firstChild.data}')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Phone1\nnumber: 00302681123456\ntype: home\n##############################\nPhone2\nnumber: 00302681654321\ntype: work\n##############################\nPhone3\nnumber: 00306971234567\ntype: mobile\n##############################\n"
]
}
],
"source": [
"# print each phone entry, numbered from 1, followed by its nested fields\n",
"for phones in xmlparse.getElementsByTagName('phoneNumbers'):\n",
"    # only element children count as phone entries\n",
"    entries = [child for child in phones.childNodes if child.nodeType == child.ELEMENT_NODE]\n",
"    for position, entry in enumerate(entries, start=1):\n",
"        print(f'Phone{position}')\n",
"        for field in entry.childNodes:\n",
"            if field.nodeType != field.ELEMENT_NODE:\n",
"                continue\n",
"            print(f'{field.tagName}: {field.firstChild.data}')\n",
"        print(\"#\"*30)"
]
},
{
"source": [
"## Parsing με το ElementTree"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n##############################\ncity Άρτα\npostalCode 45221\nstate Ήπειρος\nstreetAddress Ανεξαρτησίας 33\nage 30\nfirstName Γιάννης\nlastName Παπαδόπουλος\nmarried true\nnumber 00302681123456\ntype home\nnumber 00302681654321\ntype work\nnumber 00306971234567\ntype mobile\nemail None\n"
]
}
],
"source": [
"# traverse the XML tree, using list(elem) to obtain the children of each element\n",
"\n",
"import xml.etree.ElementTree as ET\n",
"\n",
"path = \"../../../datasets/person.xml\"\n",
"# parse inside a with-block so the file handle is closed once the tree is built\n",
"with open(path, \"r\", encoding=\"utf-8\") as xml_f:\n",
"    tree = ET.ElementTree(file=xml_f)\n",
"\n",
"root = tree.getroot()\n",
"print(root)\n",
"print(\"#\" * 30)\n",
"\n",
"for elem in list(root):\n",
"    if elem.tag == 'address':\n",
"        for elem2 in list(elem):\n",
"            print(elem2.tag, elem2.text)\n",
"    elif elem.tag == 'phoneNumbers':\n",
"        # iterate over every field of each phone entry instead of indexing [0] and [1],\n",
"        # which would raise IndexError for an entry with fewer than two children\n",
"        for elem2 in list(elem):\n",
"            for elem3 in list(elem2):\n",
"                print(elem3.tag, elem3.text)\n",
"    else:\n",
"        print(elem.tag, elem.text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"ADDRESS\ncity Άρτα\npostalCode 45221\nstate Ήπειρος\nstreetAddress Ανεξαρτησίας 33\nage 30\nfirstName Γιάννης\nlastName Παπαδόπουλος\nmarried true\nPHONENUMBERS\nnumber 00302681123456\ntype home\nnumber 00302681654321\ntype work\nnumber 00306971234567\ntype mobile\nemail None\n"
]
}
],
"source": [
"# traverse the XML tree with an iterator\n",
"\n",
"section_titles = {'address': 'ADDRESS', 'phoneNumbers': 'PHONENUMBERS'}  # tag -> heading to print\n",
"silent_tags = ('root', 'element')  # tags that are skipped without output\n",
"\n",
"for elem in tree.iter():\n",
"    if elem.tag in silent_tags:\n",
"        continue\n",
"    if elem.tag in section_titles:\n",
"        print(section_titles[elem.tag])\n",
"    else:\n",
"        print(f'{elem.tag} {elem.text}')"
]
},
{
"source": [
"### Εγγραφή δεδομένων σε αρχείο XML με το ElementTree\n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# build an XML document that contains both attributes and elements\n",
"\n",
"import xml.etree.ElementTree as ET\n",
"\n",
"# one dict per note: 'id' becomes an attribute of <note>, the other keys become child elements\n",
"notes = [\n",
"    {'id': '501', 'to': 'Tove', 'from': 'Jani', 'heading': 'Reminder',\n",
"     'body': \"Don't forget me the weekend!\"},\n",
"    {'id': '502', 'to': 'Jani', 'from': 'Tove', 'heading': 'Re: Reminder',\n",
"     'body': \"I will not\"},\n",
"]\n",
"\n",
"xml_doc = ET.Element(\"messages\")\n",
"for note in notes:\n",
"    note_elem = ET.SubElement(xml_doc, \"note\")\n",
"    note_elem.set('id', note['id'])\n",
"    # child elements are created in a fixed order so the output layout is stable\n",
"    for tag in ('to', 'from', 'heading', 'body'):\n",
"        ET.SubElement(note_elem, tag).text = note[tag]\n",
"\n",
"# https://stackoverflow.com/questions/749796/pretty-printing-xml-in-python\n",
"def prettify(element, indent=\" \"):\n",
"    \"\"\"Indent an ElementTree element in place by rewriting text/tail whitespace.\n",
"\n",
"    element: root of the subtree to format; it and its descendants are modified.\n",
"    indent: string repeated once per nesting level.\n",
"    Returns None.\n",
"    \"\"\"\n",
"    # depth-first walk with an explicit stack of (depth, node) pairs\n",
"    pending = [(0, element)]\n",
"    while pending:\n",
"        depth, node = pending.pop()\n",
"        kids = list(node)\n",
"        if kids:\n",
"            # whitespace placed before the first child's opening tag\n",
"            node.text = \"\\n\" + indent * (depth + 1)\n",
"        # checked before this node's children are pushed, so the top of the\n",
"        # stack is the next element that is not a descendant of this node\n",
"        if pending:\n",
"            node.tail = \"\\n\" + indent * pending[-1][0]\n",
"        else:\n",
"            # nothing follows: line up with the enclosing closing tag\n",
"            node.tail = \"\\n\" + indent * (depth - 1)\n",
"        # push children reversed so the first child is popped next\n",
"        pending.extend((depth + 1, kid) for kid in reversed(kids))\n",
"\n",
"prettify(xml_doc)\n",
"# use a dedicated name so the `tree` parsed from person.xml in an earlier cell is not\n",
"# overwritten; re-running the traversal cells then still iterates over person.xml\n",
"notes_tree = ET.ElementTree(xml_doc)\n",
"notes_tree.write('../../../datasets/notes.xml')"
]
},
{
"source": [
"## xmltodict\n",
"\n",
"Μετατρέπει XML δεδομένα σε λεξικό\n",
"\n",
" $ pip install xmltodict"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"OrderedDict([('root',\n",
" OrderedDict([('address',\n",
" OrderedDict([('city', 'Άρτα'),\n",
" ('postalCode', '45221'),\n",
" ('state', 'Ήπειρος'),\n",
" ('streetAddress',\n",
" 'Ανεξαρτησίας 33')])),\n",
" ('age', '30'),\n",
" ('firstName', 'Γιάννης'),\n",
" ('lastName', 'Παπαδόπουλος'),\n",
" ('married', 'true'),\n",
" ('phoneNumbers',\n",
" OrderedDict([('element',\n",
" [OrderedDict([('number',\n",
" '00302681123456'),\n",
" ('type', 'home')]),\n",
" OrderedDict([('number',\n",
" '00302681654321'),\n",
" ('type', 'work')]),\n",
" OrderedDict([('number',\n",
" '00306971234567'),\n",
" ('type',\n",
" 'mobile')])])])),\n",
" ('email', OrderedDict([('@null', 'true')]))]))])"
]
},
"metadata": {},
"execution_count": 10
}
],
"source": [
"import xmltodict\n",
"\n",
"path = \"../../../datasets/person.xml\"\n",
"# the with-block closes the file handle once the content has been parsed\n",
"with open(path, \"r\", encoding=\"utf-8\") as xml_f:\n",
"    xmldict = xmltodict.parse(xml_f.read())\n",
"\n",
"xmldict"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Παπαδόπουλος\nΆρτα\n00302681123456\n"
]
}
],
"source": [
"# every value lives under the top-level 'root' key\n",
"person = xmldict['root']\n",
"print(person['lastName'])\n",
"print(person['address']['city'])\n",
"print(person['phoneNumbers']['element'][0]['number'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
]
}