{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" }, "orig_nbformat": 2, "kernelspec": { "name": "python3", "display_name": "Python 3.8.0 64-bit ('3.8.0-amd64': pyenv)", "metadata": { "interpreter": { "hash": "2334294e789e06453e89f7d5c3c2573679701f9bb5b76ad2158f91cca5c9244c" } } } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "source": [ " Τμήμα Πληροφορικής και Τηλεπικοινωνιών - Άρτα \n", " Πανεπιστήμιο Ιωαννίνων \n", "\n", " Γκόγκος Χρήστος \n", " http://chgogos.github.io/\n", " Εαρινό εξάμηνο 2020-2021" ], "cell_type": "markdown", "metadata": {} }, { "source": [ "# XML \n", "\n", "minidom και ElementTree (standard library)\n", "\n", "xmltodict (3rd party library)" ], "cell_type": "markdown", "metadata": {} }, { "source": [ "## Parsing με το minidom" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['Attr', 'AttributeList', 'CDATASection', 'CharacterData', 'Childless', 'Comment', 'DOMImplementation', 'DOMImplementationLS', 'Document', 'DocumentFragment', 'DocumentLS', 'DocumentType', 'EMPTY_NAMESPACE', 'EMPTY_PREFIX', 'Element', 'ElementInfo', 'EmptyNodeList', 'Entity', 'Identified', 'NamedNodeMap', 'Node', 'NodeList', 'Notation', 'ProcessingInstruction', 'ReadOnlySequentialNamedNodeMap', 'StringTypes', 'Text', 'TypeInfo', 'XMLNS_NAMESPACE', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '_append_child', '_clear_id_cache', '_clone_node', '_do_pulldom_parse', '_get_containing_element', '_get_containing_entref', '_get_elements_by_tagName_helper', '_get_elements_by_tagName_ns_helper', '_in_document', '_no_type', '_nodeTypes_with_children', '_nssplit', '_set_attribute_node', '_write_data', 'defproperty', 'domreg', 'getDOMImplementation', 'io', 'parse', 'parseString', 'xml']\n" ] } ], "source": [ "import xml.dom.minidom\n", "print(dir(xml.dom.minidom))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n\n\t\n \n\t
\n\t\t\n \n\t\tΆρτα\n\t\t\n \n\t\t45221\n\t\t\n \n\t\tΉπειρος\n\t\t\n \n\t\tΑνεξαρτησίας 33\n\t\t\n \n\t
\n\t\n \n\t30\n\t\n \n\tΓιάννης\n\t\n \n\tΠαπαδόπουλος\n\t\n \n\ttrue\n\t\n \n\t\n\t\t\n \n\t\t\n\t\t\t\n \n\t\t\t00302681123456\n\t\t\t\n \n\t\t\thome\n\t\t\t\n \n\t\t\n\t\t\n \n\t\t\n\t\t\t\n \n\t\t\t00302681654321\n\t\t\t\n \n\t\t\twork\n\t\t\t\n \n\t\t\n\t\t\n \n\t\t\n\t\t\t\n \n\t\t\t00306971234567\n\t\t\t\n \n\t\t\tmobile\n\t\t\t\n \n\t\t\n\t\t\n \n\t\n\t\n \n\t\n\t\n\n
\n\n" ] } ], "source": [ "# parsing του XML, εκτύπωση\n", "\n", "path = \"../../../datasets/person.xml\"\n", "xml_f = open(path, \"r\", encoding=\"utf-8\")\n", "xmlparse = xml.dom.minidom.parseString(xml_f.read())\n", "prettyxml = xmlparse.toprettyxml()\n", "print(prettyxml)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'Παπαδόπουλος'" ] }, "metadata": {}, "execution_count": 3 } ], "source": [ "# εμφάνιση της πληροφορίας lastName ως XML\n", "\n", "xmlparse.getElementsByTagName('lastName')[0].toxml()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Παπαδόπουλος\nΠαπαδόπουλος\n" ] } ], "source": [ "# εμφάνιση της πληροφορίας lastName\n", "\n", "print(xmlparse.getElementsByTagName('lastName')[0].firstChild.data)\n", "print(xmlparse.getElementsByTagName('lastName')[0].firstChild.nodeValue) # το nodeValue είναι ψευδώνυμο για το data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "city: Άρτα\npostalCode: 45221\nstate: Ήπειρος\nstreetAddress: Ανεξαρτησίας 33\n" ] } ], "source": [ "# εμφάνιση όλων των εμφωλευμένων πληροφοριών στην ετικέτα address\n", "\n", "for node in xmlparse.getElementsByTagName('address')[0].childNodes:\n", " if node.nodeType == node.ELEMENT_NODE:\n", " print(f'{node.tagName}: {node.firstChild.data}')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Phone1\nnumber: 00302681123456\ntype: home\n##############################\nPhone2\nnumber: 00302681654321\ntype: work\n##############################\nPhone3\nnumber: 00306971234567\ntype: mobile\n##############################\n" ] } ], "source": [ "for node in xmlparse.getElementsByTagName('phoneNumbers'):\n", " i=0\n", " for node2 in node.childNodes:\n", " if node2.nodeType == node.ELEMENT_NODE:\n", " i+=1\n", " print(f'Phone{i}')\n", " for node3 in node2.childNodes:\n", " if node3.nodeType == node.ELEMENT_NODE:\n", " print(f'{node3.tagName}: {node3.firstChild.data}')\n", " print(\"#\"*30)\n", " " ] }, { "source": [ "## Parsing με το ElementTree" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n##############################\ncity Άρτα\npostalCode 45221\nstate Ήπειρος\nstreetAddress Ανεξαρτησίας 33\nage 30\nfirstName Γιάννης\nlastName Παπαδόπουλος\nmarried true\nnumber 00302681123456\ntype home\nnumber 00302681654321\ntype work\nnumber 00306971234567\ntype mobile\nemail None\n" ] } ], "source": [ "# διάσχιση XML χρησιμοποιώντας τη συνάρτηση list(elem)\n", "\n", "import xml.etree.ElementTree as ET \n", "\n", "path = \"../../../datasets/person.xml\"\n", "xml_f = open(path, \"r\", encoding=\"utf-8\")\n", "tree = ET.ElementTree(file=xml_f)\n", "\n", "root = tree.getroot()\n", "print(root)\n", "print(\"#\" * 30)\n", "\n", "for elem in list(root):\n", " if elem.tag == 'address': \n", " for elem2 in list(elem):\n", " print(elem2.tag, elem2.text)\n", " elif elem.tag == 'phoneNumbers':\n", " for elem2 in list(elem):\n", " print(list(elem2)[0].tag, list(elem2)[0].text)\n", " print(list(elem2)[1].tag, list(elem2)[1].text)\n", " else:\n", " print(elem.tag, elem.text)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "ADDRESS\ncity Άρτα\npostalCode 45221\nstate Ήπειρος\nstreetAddress Ανεξαρτησίας 33\nage 30\nfirstName Γιάννης\nlastName Παπαδόπουλος\nmarried true\nPHONENUMBERS\nnumber 00302681123456\ntype home\nnumber 00302681654321\ntype work\nnumber 00306971234567\ntype mobile\nemail None\n" ] } ], "source": [ "# διάσχιση XML με iterator \n", "\n", "for elem in tree.iter():\n", " if elem.tag == 'root':\n", " continue\n", " elif elem.tag == 'address':\n", " print('ADDRESS')\n", " elif elem.tag == 'phoneNumbers':\n", " print('PHONENUMBERS')\n", " elif elem.tag == 'element':\n", " continue\n", " else: \n", " print(f'{elem.tag} {elem.text}')" ] }, { "source": [ "### Εγγραφή δεδομένων σε αρχείο XML με το ElementTree\n" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# εγγραφή αρχείου XML που περιέχει attributes και elements\n", "\n", "import xml.etree.ElementTree as ET \n", "\n", "xml_doc = ET.Element(\"messages\")\n", "note1 = ET.SubElement(xml_doc, \"note\")\n", "note1.set('id', '501')\n", "note1_to = ET.SubElement(note1, \"to\")\n", "note1_to.text = 'Tove'\n", "note1_from = ET.SubElement(note1, \"from\")\n", "note1_from.text = 'Jani'\n", "note1_heading = ET.SubElement(note1, \"heading\")\n", "note1_heading.text = 'Reminder'\n", "note1_body = ET.SubElement(note1, \"body\")\n", "note1_body.text = \"Don't forget me the weekend!\"\n", "\n", "note2 = ET.SubElement(xml_doc, \"note\")\n", "note2.set('id', '502')\n", "note2_to = ET.SubElement(note2, \"to\")\n", "note2_to.text = 'Jani'\n", "note2_from = ET.SubElement(note2, \"from\")\n", "note2_from.text = 'Tove'\n", "note2_heading = ET.SubElement(note2, \"heading\")\n", "note2_heading.text = 'Re: Reminder'\n", "note2_body = ET.SubElement(note2, \"body\")\n", "note2_body.text = \"I will not\"\n", "\n", "# https://stackoverflow.com/questions/749796/pretty-printing-xml-in-python\n", "def prettify(element, indent=\" \"):\n", " queue = [(0, element)] # (level, element)\n", " while queue:\n", " level, element = queue.pop(0)\n", " children = [(level + 1, child) for child in list(element)]\n", " if children:\n", " element.text = \"\\n\" + indent * (level + 1) # for child open\n", " if queue:\n", " element.tail = \"\\n\" + indent * queue[0][0] # for sibling open\n", " else:\n", " element.tail = \"\\n\" + indent * (level - 1) # for parent close\n", " queue[0:0] = children # prepend so children come before siblings\n", "\n", "prettify(xml_doc)\n", "tree = ET.ElementTree(xml_doc)\n", "tree.write('../../../datasets/notes.xml')" ] }, { "source": [ "## xmltodict\n", "\n", "Μετατρέπει XML δεδομένα σε λεξικό\n", "\n", " $ pip install xmltodict" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "OrderedDict([('root',\n", " OrderedDict([('address',\n", " OrderedDict([('city', 'Άρτα'),\n", " ('postalCode', '45221'),\n", " ('state', 'Ήπειρος'),\n", " ('streetAddress',\n", " 'Ανεξαρτησίας 33')])),\n", " ('age', '30'),\n", " ('firstName', 'Γιάννης'),\n", " ('lastName', 'Παπαδόπουλος'),\n", " ('married', 'true'),\n", " ('phoneNumbers',\n", " OrderedDict([('element',\n", " [OrderedDict([('number',\n", " '00302681123456'),\n", " ('type', 'home')]),\n", " OrderedDict([('number',\n", " '00302681654321'),\n", " ('type', 'work')]),\n", " OrderedDict([('number',\n", " '00306971234567'),\n", " ('type',\n", " 'mobile')])])])),\n", " ('email', OrderedDict([('@null', 'true')]))]))])" ] }, "metadata": {}, "execution_count": 10 } ], "source": [ "import xmltodict\n", "\n", "path = \"../../../datasets/person.xml\"\n", "xml_f = open(path, \"r\", encoding=\"utf-8\")\n", "\n", "xmldict = xmltodict.parse(xml_f.read())\n", "xmldict" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Παπαδόπουλος\nΆρτα\n00302681123456\n" ] } ], "source": [ "print(xmldict['root']['lastName'])\n", "print(xmldict['root']['address']['city'])\n", "print(xmldict['root']['phoneNumbers']['element'][0]['number'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ] }