{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Handling XML Files\n",
    "\n",
    "This notebook showcases methods to read XML type data using:\n",
    "+ xml library\n",
    "+ xmltodict library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import required libraries\n",
    "import xml.etree.ElementTree as ET\n",
    "import xmltodict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def print_nested_dicts(nested_dict,indent_level=0):\n",
    "    \"\"\"This function prints a nested dict object\n",
    "    Args:\n",
    "        nested_dict (dict): the dictionary to be printed\n",
    "        indent_level (int): the indentation level for nesting\n",
    "    Returns:\n",
    "        None\n",
    "\n",
    "    \"\"\"\n",
    "    \n",
    "    for key, val in nested_dict.items():\n",
    "        if isinstance(val, dict):\n",
    "          print(\"{0} : \".format(key))\n",
    "          print_nested_dicts(val,indent_level=indent_level+1)\n",
    "        elif isinstance(val,list):\n",
    "            print(\"{0} : \".format(key))\n",
    "            for rec in val:\n",
    "                print_nested_dicts(rec,indent_level=indent_level+1)\n",
    "        else:\n",
    "          print(\"{0}{1} : {2}\".format(\"\\t\"*indent_level,key, val))\n",
    "          \n",
    "def print_xml_tree(xml_root,indent_level=0):\n",
    "    \"\"\"This function prints a nested dict object\n",
    "    Args:\n",
    "        xml_root (dict): the xml tree to be printed\n",
    "        indent_level (int): the indentation level for nesting\n",
    "    Returns:\n",
    "        None\n",
    "\n",
    "    \"\"\"\n",
    "    for child in xml_root:\n",
    "            print(\"{0}tag:{1}, attribute:{2}\".format(\n",
    "                                                \"\\t\"*indent_level,\n",
    "                                                child.tag,\n",
    "                                                child.attrib))\n",
    "                                                \n",
    "            print(\"{0}tag data:{1}\".format(\"\\t\"*indent_level,\n",
    "                                            child.text))\n",
    "                                            \n",
    "            print_xml_tree(child,indent_level=indent_level+1)\n",
    "            \n",
    "\n",
    "\n",
    "def read_xml(file_name):\n",
    "    \"\"\"This function extracts and prints XML content from a given file\n",
    "    Args:\n",
    "        file_name (str): file path to be read\n",
    "    Returns:\n",
    "        None\n",
    "\n",
    "    \"\"\"\n",
    "    try:\n",
    "        tree = ET.parse(file_name)\n",
    "        root = tree.getroot()\n",
    "        \n",
    "        print(\"Root tag:{0}\".format(root.tag))\n",
    "        print(\"Attributes of Root:: {0}\".format(root.attrib))\n",
    "        \n",
    "        print_xml_tree(root)\n",
    "            \n",
    "    except IOError:\n",
    "        raise IOError(\"File path incorrect/ File not found\")\n",
    "    except Exception:\n",
    "        raise\n",
    "\n",
    "    \n",
    "\n",
    "def read_xml2dict_xml(file_name):\n",
    "    \"\"\"This function extracts and prints xml content from a file using xml2dict\n",
    "    Args:\n",
    "        file_name (str): file path to be read\n",
    "    Returns:\n",
    "        None\n",
    "\n",
    "    \"\"\"\n",
    "    try:\n",
    "        xml_filedata = open(file_name).read() \n",
    "        ordered_dict = xmltodict.parse(xml_filedata)\n",
    "        \n",
    "        print_nested_dicts(ordered_dict)\n",
    "    except IOError:\n",
    "        raise IOError(\"File path incorrect/ File not found\")\n",
    "    except ValueError:\n",
    "        ValueError(\"XML file has errors\")\n",
    "    except Exception:\n",
    "        raise "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parse using XML module\n",
    "\n",
    "The read_xml() function takes the input file name as input parameter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Root tag:records\n",
      "Attributes of Root:: {'attr': 'sample xml records'}\n",
      "tag:record, attribute:{'name': 'rec_1'}\n",
      "tag data:\n",
      "\t  \n",
      "\ttag:sub_element, attribute:{}\n",
      "\ttag data:\n",
      "\t    \n",
      "\t\ttag:detail1, attribute:{}\n",
      "\t\ttag data:Attribute 1\n",
      "\t\ttag:detail2, attribute:{}\n",
      "\t\ttag data:2\n",
      "\ttag:sub_element_with_attr, attribute:{'attr': 'complex'}\n",
      "\ttag data:\n",
      "\t    Sub_Element_Text\n",
      "\t  \n",
      "\ttag:sub_element_only_attr, attribute:{'attr_val': 'only_attr'}\n",
      "\ttag data:None\n",
      "tag:record, attribute:{'name': 'rec_2'}\n",
      "tag data:\n",
      "\t  \n",
      "\ttag:sub_element, attribute:{}\n",
      "\ttag data:\n",
      "\t    \n",
      "\t\ttag:detail1, attribute:{}\n",
      "\t\ttag data:Attribute 1\n",
      "\t\ttag:detail2, attribute:{}\n",
      "\t\ttag data:2\n",
      "\ttag:sub_element_with_attr, attribute:{'attr': 'complex'}\n",
      "\ttag data:\n",
      "\t    Sub_Element_Text\n",
      "\t  \n",
      "\ttag:sub_element_only_attr, attribute:{'attr_val': 'only_attr'}\n",
      "\ttag data:None\n",
      "tag:record, attribute:{'name': 'rec_3'}\n",
      "tag data:\n",
      "\t  \n",
      "\ttag:sub_element, attribute:{}\n",
      "\ttag data:\n",
      "\t    \n",
      "\t\ttag:detail1, attribute:{}\n",
      "\t\ttag data:Attribute 1\n",
      "\t\ttag:detail2, attribute:{}\n",
      "\t\ttag data:2\n",
      "\ttag:sub_element_with_attr, attribute:{'attr': 'complex'}\n",
      "\ttag data:\n",
      "\t    Sub_Element_Text\n",
      "\t  \n",
      "\ttag:sub_element_only_attr, attribute:{'attr_val': 'only_attr'}\n",
      "\ttag data:None\n"
     ]
    }
   ],
   "source": [
    "read_xml(r'sample_xml.xml')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The function generates a nested output resembling the structure of the XML itself. This function provides flexibility in terms of identifying the structure and parsing XML nodes as required.\n",
    "\n",
    "\n",
    "--------"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parse using xmltodict\n",
    "\n",
    "The read_xml2dict_xml() function takes the input file name  as input parameter. It uses xmltodict to do the heavy lifting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "records : \n",
      "\t@attr : sample xml records\n",
      "record : \n",
      "\t\t@name : rec_1\n",
      "sub_element : \n",
      "\t\t\tdetail1 : Attribute 1\n",
      "\t\t\tdetail2 : 2\n",
      "sub_element_with_attr : \n",
      "\t\t\t@attr : complex\n",
      "\t\t\t#text : Sub_Element_Text\n",
      "sub_element_only_attr : \n",
      "\t\t\t@attr_val : only_attr\n",
      "\t\t@name : rec_2\n",
      "sub_element : \n",
      "\t\t\tdetail1 : Attribute 1\n",
      "\t\t\tdetail2 : 2\n",
      "sub_element_with_attr : \n",
      "\t\t\t@attr : complex\n",
      "\t\t\t#text : Sub_Element_Text\n",
      "sub_element_only_attr : \n",
      "\t\t\t@attr_val : only_attr\n",
      "\t\t@name : rec_3\n",
      "sub_element : \n",
      "\t\t\tdetail1 : Attribute 1\n",
      "\t\t\tdetail2 : 2\n",
      "sub_element_with_attr : \n",
      "\t\t\t@attr : complex\n",
      "\t\t\t#text : Sub_Element_Text\n",
      "sub_element_only_attr : \n",
      "\t\t\t@attr_val : only_attr\n"
     ]
    }
   ],
   "source": [
    "read_xml2dict_xml(r'sample_xml.xml')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The output in the above cell shows how xmltodict reads an XML file. The function utilizes the xmltodict library to perform the node traversal and extract relevant information. "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}