Difference between revisions of "Python:XML"

Latest revision as of 16:42, 11 August 2022

TODO:

Check iterparse (ElementTree.iterparse) for parsing large files.
Check lxml.etree as, according to the developers, it offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer.

Convert XML to a dict

import xmltodict
dict1 = xmltodict.parse(xml-string)

from xml.etree import ElementTree as ET: Load XML-support

tree = ET.parse(filename): Read the file into an XML-tree (an xml-string is parsed with ET.fromstring, see below)

root = tree.getroot()
root = ET.fromstring(xml-string): Get the tree root element object

for element in root: Get the elements in root

element.tag
element.attrib
element.text: <tag attribname=attrib>text</tag>

element.find(elementname): Get first subelement by name (element can be root too)

Code example:

#!/usr/bin/env python3

def usage():
    """Print a short help text for this script and terminate the program.

    Raises:
        SystemExit: always, without an exit code (process exit status 0).
    """
    # Imported locally so the function does not depend on where the
    # module-level imports are placed relative to this definition.
    import sys

    print ("Print XML tags, attributes and text")
    print ("Usage: "+__file__+" <xmlfile>")
    # sys.exit() is the programmatic way to stop; the bare exit() helper is
    # injected by the site module for interactive sessions only.
    sys.exit()

import os
import sys
from xml.etree import ElementTree as ET

def main():
    """Dump the element tree of the XML file named on the command line.

    Exactly one command-line argument is expected; with any other count the
    usage text is shown, which ends the program. After the recursive dump
    the root element object itself is printed.
    """
    if len(sys.argv) != 2:
        usage()
    xml_path = sys.argv[1]
    document_root = get_tree(xml_path).getroot()
    get_all(document_root)
    print(document_root)
    
def get_all(element,prefix=""):
    """Recursively print tag, attributes and text of every descendant.

    Args:
        element: an iterable node whose children expose ``tag``, ``attrib``
            and ``text`` (e.g. an ElementTree element).
        prefix: indentation accumulated so far; one tab is added per level.
    """
    indent = prefix + "\t"
    for node in element:
        # print() separates its arguments with a single space.
        print (indent, node.tag, node.attrib, node.text)
        get_all(node, indent)
            
def get_tree(filename):
    """Parse the XML file *filename* and return the resulting ElementTree.

    If *filename* is not an existing regular file, the usage text is shown
    instead (which ends the program).
    """
    if not os.path.isfile(filename):
        usage()
    # ET.parse accepts the path directly, so the file need not be opened here.
    return ET.parse(filename)
    
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()

Flatten XML

The code below converts any XML file into one line per leaf. It processes the file tag by tag and does not read the entire file into memory; therefore it can handle very large files regardless of the memory available. It is very slow when large files are processed (iterparse may be helpful here).

Output format:

tag1
tag1/tag2.attrib.=value
tag1/tag2.attrib.=value/tag3 content

#!/usr/bin/env python3

def usage():
    """Print a short help text for this script and terminate the program.

    Raises:
        SystemExit: always, without an exit code (process exit status 0).
    """
    # Imported locally so the function does not depend on where the
    # module-level imports are placed relative to this definition.
    import sys

    print ("Convert any XML to 1 line per leave including attributes and content")
    print ("Usage: "+__file__+" [<xmlfiles>]")
    # sys.exit() is the programmatic way to stop; the bare exit() helper is
    # injected by the site module for interactive sessions only.
    sys.exit()

import os
import sys

scriptname = os.path.abspath(__file__)

## end of standard part ##

import glob,re

blockl = 65536
minblock = 256

def main():
    """Flatten every XML file matched by the command-line arguments.

    Each argument is treated as its own glob pattern, so both a quoted
    pattern and a shell-expanded list of file names work. Matches that are
    not regular files are skipped.
    """
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if not os.path.isfile(filename):
                continue
            # The context manager closes the file even if parsing fails.
            with open(filename, "r") as f1:
                print('File is open')
                readit(f1)

def readit(f1):
    """Stream an open XML file through doit() in chunks of ``blockl`` characters.

    Line breaks are removed and whitespace between adjacent tags is
    collapsed before parsing. While more input is available, parsing stops
    once ``minblock`` or fewer characters remain, so a tag split across two
    chunks is completed by the next read; the leftover is parsed after the
    last chunk.

    Args:
        f1: file object opened for reading text.
    """
    taglst = []
    blockrest = ''
    newblock = f1.read(blockl)
    while len(newblock) > 0:
        block = re.sub(r'>\s*<', '><',
                       blockrest + newblock.replace('\n', '').replace('\r', ''))
        while len(block) > minblock:
            shorter, taglst = doit(block, taglst)
            if len(shorter) >= len(block):
                # Nothing was consumed (e.g. an unfinished tag or text):
                # fetch more input instead of spinning on the same data.
                break
            block = shorter
        blockrest = block
        newblock = f1.read(blockl)
    block = blockrest
    while len(block) > 1:
        shorter, taglst = doit(block, taglst)
        if len(shorter) >= len(block):
            # Unparseable remainder (e.g. trailing blanks): stop rather
            # than loop forever.
            break
        block = shorter
    return

def doit(block,taglst):
    """Consume one parsing step from the start of *block* and print the path.

    A step is, in order and each part optional: a leading ``<? ... ?>``
    declaration, one start (or self-closing) tag with its attributes, the
    text directly following it, and one end tag. The path of open elements
    joined by '/' is printed once per call.

    Args:
        block: XML text without line breaks.
        taglst: stack of labels of the currently open elements; it is
            modified in place.

    Returns:
        Tuple ``(rest_of_block, taglst)``. ``rest_of_block`` equals *block*
        when nothing could be consumed.

    NOTE(review): comments and ``<!DOCTYPE ...>`` are matched by the start-tag
    pattern and therefore handled like ordinary tags.
    """
    # Remove garbage (XML declaration / processing instruction) from the beginning
    mo1 = re.match(r'\s*<\?.*?\?>\s*', block)
    if mo1:
        block = block[mo1.end():]

    # Find a tag; group 2 is the '/' of a self-closing '<tag/>'
    tagl1 = []
    opened = False
    selfclosing = False
    mo1 = re.match(r'\s*<\s*([^/?].*?)(/)?>', block)
    if mo1:
        opened = True
        selfclosing = mo1.group(2) is not None
        block = block[mo1.end():]
        inner = mo1.group(1)
        tagl1.append(re.match(r'\S*', inner).group(0))
        # Get the tag attributes
        mo2 = re.search(r'\s+(.*)', inner)
        if mo2:
            attdct = get_attr(mo2.group(0))
            tagl1.append('.'.join(attr + '=' + attdct[attr] for attr in attdct))

    # Find content: everything up to (not including) the next '<'
    mo1 = re.match(r'([^<].*?)<', block)
    if mo1:
        block = block[mo1.end()-1:]
        tagl1.append('=' + mo1.group(1))

    if opened:
        taglst.append('.'.join(tagl1))
        print('/'.join(taglst))
    else:
        # No new element: either nothing new at all, or text that follows a
        # child's end tag. Show such text below the current path without
        # pushing it, so the element stack stays balanced.
        print('/'.join(taglst + tagl1))

    # A self-closing tag is closed immediately
    if selfclosing:
        taglst.pop()
    # Find end-tag. Only real end tags count here; a following self-closing
    # tag is a new element and is handled by the next call.
    mo1 = re.match(r'\s*</.*?>', block)
    if mo1:
        if taglst:  # tolerate an unbalanced end tag instead of crashing
            taglst.pop()
        block = block[mo1.end():]

    return (block,taglst)
    
def get_attr(attrs):
    """Split the attribute part of a tag into a name -> value dict.

    Values keep their surrounding quotes. A quoted value may contain
    whitespace; an unquoted value ends at the first whitespace. Names may
    contain ':', '.' and '-' (e.g. ``xml:lang``).

    Args:
        attrs: text following the tag name, e.g. ``' id="1" name="a b"'``.

    Returns:
        dict mapping attribute names to their (still quoted) values; empty
        when no attribute is found.
    """
    pattern = r'''([\w:.-]+)\s*=\s*("[^"]*"|'[^']*'|\S+)'''
    return {att.group(1): att.group(2) for att in re.finditer(pattern, attrs)}
    
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()

Parse into dict

#!/usr/bin/env python2

def usage():
    """Print a short help text for this script and terminate the program.

    Raises:
        SystemExit: always, without an exit code (process exit status 0).
    """
    # Imported locally so the function does not depend on where the
    # module-level imports are placed relative to this definition.
    import sys

    print ("Simple and pretty fast XML parser for all file sizes")
    print ("Usage: "+__file__+" <xmlfiles>")
    # sys.exit() is the programmatic way to stop; the bare exit() helper is
    # injected by the site module for interactive sessions only.
    sys.exit()
import os,sys,re,glob


filespec = '*.xml'
blockl = 100000  # Must be large enough to contain an instance of the highest node you are looking for.


def main():
    """Print every (TAG1 id, TAG2 id) pair found in the files matching ``filespec``.

    Output is one tab-separated line per pair below a heading line.
    """
    result = read_it(filespec)
    print("Heading1\t->\tHeading2t")
    for tag1id in result:
        for tag2id in result[tag1id]:
            # format() instead of '+': an id is None when the element has no
            # ID attribute, and None cannot be concatenated to a string.
            print("{0}\t->\t{1}".format(tag1id, tag2id))
    return


def read_it(filespec):
    """Read all files matching *filespec* chunk-wise and collect their data.

    Every file is read in chunks of ``blockl`` characters. Each chunk is
    appended to whatever do_it() left unprocessed from the previous one, so
    a node that spans a chunk boundary is completed by the next read.

    Args:
        filespec: glob pattern selecting the XML files.

    Returns:
        The dict filled by do_it(); empty when no file matches.
    """
    result = {}
    for filename in glob.glob(filespec):
        if not os.path.isfile(filename):
            continue
        with open(filename, "r") as f1:
            blockrest = ''
            first = True
            while True:
                newblock = f1.read(blockl)
                if len(newblock) == 0:
                    break
                if first:
                    # Remove garbage (the XML declaration) from the beginning of the file
                    garbage_mo = re.match(r'\s*<\?.*?\?>\s*', newblock)
                    if garbage_mo:
                        newblock = newblock[garbage_mo.end():]
                    first = False
                blockrest, result = do_it(blockrest + newblock, result)
    return result


def do_it(block,result):
    """Extract all complete TAG1 nodes from *block* and record their TAG2 ids.

    Args:
        block: chunk of XML text.
        result: dict mapping a TAG1 ``ID`` value to the set of ``ID`` values
            of the TAG2 nodes inside it; updated in place.

    Returns:
        Tuple ``(rest, result)`` where ``rest`` is the part of *block* after
        the last processed TAG1 node (to be prepended to the next chunk).
    """
    for tag1 in get_node(block,'TAG1'):
        # Remove everything from the beginning up to the end of this node
        dataend = block.find(tag1) + len(tag1)
        block = block[dataend:]

        tag1id = get_attr(tag1,'ID')
        tag2ids = result.setdefault(tag1id, set())
        for tag2 in get_node(tag1,'TAG2'):
            # The text of a nested TAG3 can be read with
            # get_content(tag2,'TAG3') when it is needed.
            tag2ids.add(get_attr(tag2,'ID'))
    return block,result


def get_node(block,tag):
    """Return the source text of every *tag* element found in *block*.

    Elements with a separate end tag are looked for first; only when none
    exist are self-closing elements (``<tag .../>``) returned.

    Args:
        block: XML text to search (may span several lines).
        tag: literal element name.

    Returns:
        List of matched element strings, in document order; empty when the
        tag does not occur.
    """
    name = re.escape(tag)
    taglist = re.findall(r'<\s*' + name + r'[\s>].*?</' + name + r'\s*>', block, re.DOTALL)
    if not taglist:
        # The name must be followed by whitespace or '/>' so that e.g. TAG1
        # does not match <TAG10/>; [^>] keeps the match inside one tag.
        taglist = re.findall(r'<\s*' + name + r'(?:\s[^>]*?)?/>', block, re.DOTALL)
    return taglist


def get_content(block,tag):
    """Return the text between the start and end tag of the first *tag* element.

    Args:
        block: XML text to search (may span several lines).
        tag: literal element name.

    Returns:
        The content string (possibly empty), or None when no element with a
        separate end tag is found.
    """
    name = re.escape(tag)
    # The optional group consumes the attributes so they do not end up in the
    # captured content; (?<!/) skips self-closing tags, which have no content.
    content_mo = re.search(
        r'<\s*' + name + r'(?:\s[^>]*)?(?<!/)>(.*?)</' + name + r'\s*>',
        block, re.DOTALL)
    if content_mo is None:
        return None
    return content_mo.group(1)


def get_attr(block,attr):
    """Return the value of attribute *attr* from the first place it occurs in *block*.

    Args:
        block: XML text, typically one element.
        attr: literal attribute name; it must be preceded by whitespace.

    Returns:
        The value without its quotes, or None when the attribute is absent.
    """
    # \1 requires the closing quote to be the same character as the opening
    # one, so a value like "it's" is not cut at the apostrophe.
    attribute_mo = re.search(
        r'\s' + re.escape(attr) + r'''\s*=\s*(["'])(.*?)\1''',
        block, re.DOTALL)
    if attribute_mo is None:
        return None
    return attribute_mo.group(2)


# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()

Difference between revisions of "Python:XML"

Latest revision as of 16:42, 11 August 2022

Flatten XML

Parse into dict

Navigation menu

Search

@@ Line 1: / Line 1: @@
-;from xml.etree import ElementTree
+[[Category:Python]]
+TODO:
+* Check [http://effbot.org/zone/element-iterparse.htm interparse]  for parsing large files.
+* Check lxml.etree as, [https://lxml.de/1.3/compatibility.html according to the developers], it offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer.
+[[XML|Generic page on XML]]
+;Convert XML to a dict
+<syntaxhighlight lang=python>
+import xmltodict
+dict1 = xmltodict.parse(xml-string)
+</syntaxhighlight>
+;from xml.etree import ElementTree as ET
 :Load XML-support
-;tree = get_tree(filename)
+;tree = ET.parse(filename)
-:Read the file into an XML-tree
+:Read the file or an xml-sting into an XML-tree
 ;root = tree.getroot()
+;root = ET.fromstring(xml-string)
 :Get the tree root element object
@@ Line 16: / Line 30: @@
 :<tag attribname=attrib>text</tag>
+;element.find(elementname)
+:Get first subelement by name (element can be root too)
+Code example:
 <syntaxhighlight lang=python>
 #!/usr/bin/env python3
@@ Line 28: / Line 45: @@
 import os
 import sys
-from xml.etree import ElementTree
+from xml.etree import ElementTree as ET
 def main():
@@ Line 51: / Line 68: @@
      if os.path.isfile(filename):
          #with open(filename, 'r') as f:
-          #   tree = ElementTree.parse(f)
+          #   tree = ET.parse(f)
-        tree = ElementTree.parse(filename)
+        tree = ET.parse(filename)
      else:
          usage()
      return(tree)
+main()
+</syntaxhighlight>
+===Flatten XML===
+Below code converts any XML file into lines per leaf. It processes the files tag by tag and does not read the entire file into memory, therefor it can handle very large files regardless the memory available. This is very slow if large file are processed (interparse may be helpfull here)
+Output format:
+ tag1
+ tag1/tag2.attriv.=value
+ tag1/tag2.attrib.=value/tag3 content
+<syntaxhighlight lang=python>
+#!/usr/bin/env python3
+def usage():
+    print ("Convert any XML to 1 line per leave including attributes and content")
+    print ("Usage: "+__file__+" [<xmlfiles>]")
+    exit()
+    return
+import os
+import sys
+scriptname = os.path.abspath(__file__)
+## end of standard part ##
+import glob,re
+blockl = 65536
+minblock = 256
+def main():
+    files = glob.glob(' '.join(sys.argv[1:]))
+    for filename in files:
+        if os.path.isfile(filename):
+            f1 =  open (filename,"r")
+            if f1:
+                print('File is open')
+                readit(f1)
+    return
+def readit(f1):
+    eof = 0
+    taglst = []
+    blockrest = ''
+    newblock = f1.read(blockl)
+    while len(newblock) > 0:
+        block = re.sub('>\s*<','><',blockrest+newblock.replace('\n','').replace('\r',''))
+        while len(block) > minblock:
+            block,taglst = doit(block,taglst)
+        blockrest = block
+        newblock = f1.read(blockl)
+        #print('newblocklengte = '+str(len(newblock)))
+    block = blockrest
+    while len(block) > 1:
+        block,taglst = doit(block,taglst)
+        #print('Blocklengte = '+str(len(block)))
+    return
+def doit(block,taglst):
+    #print('blockt '+block)
+    # Remove garbage from the beginning
+    mo1 = re.match('\s*<\?.*?\?>\s*',block)
+    if mo1:
+        block = block[mo1.end():]
+    # Find a tag
+    endtag = ''
+    mo1 = re.match('\s*<\s*([^\/\?].*?)(\/)?>',block)
+    if mo1:
+        #print('group0tag '+mo1.group(0))
+        try:
+            endtag = mo1.group(2)
+        except NameError:
+            pass
+        block = block[mo1.end():]
+        #print('mo1group ',mo1.group(1))
+        mo2 = re.match('\S*',mo1.group(1))
+        if mo2:
+            tag = mo2.group(0)
+            tagl1 = [tag]
+        # Get the tag attributes
+        mo2 = re.search('\s+(.*)',mo1.group(1))
+        if mo2:
+            attdct = get_attr(mo2.group(0))
+            attrline = ''
+            for attr in attdct:
+                attrline += '.'+attr+'='+attdct[attr]
+            tagl1.append(attrline[1:])
+    #print('blockc '+block)
+    # Find content
+    mo1 = re.match('([^<].*?)<',block)
+    if mo1:
+        #print('group0cont '+mo1.group(0))
+        block = block[mo1.end()-1:]
+        content = mo1.group(1)
+        tagl1.append('='+content)
+    try:
+        taglst.append('.'.join(tagl1))
+    except:
+        pass
+    print('/'.join(taglst))
+    # Find end-tag
+    if endtag:
+        popped = taglst.pop()
+        #print('Popped '+popped)
+    #print('blocke '+block)
+    mo1 = re.match('\s*(<\/.*?>|<\w*?\/>)',block)
+    if mo1:
+        popped = taglst.pop()
+        #print('Popped '+mo1.group(0))
+        block = block[mo1.end():]
+    return (block,taglst)
+def get_attr(attrs):
+    attdct = {}
+    mo2 = re.finditer('(\w+)=(\S+)',attrs)
+    if mo2:
+        for att in mo2:
+            attdct[att.group(1)] = att.group(2)
+    return(attdct)
+main()
+</syntaxhighlight>
+===Parse into [[Python:DataTypes#dict|dict]]===
+<syntaxhighlight lang=python>
+#!/usr/bin/env python2
+def usage():
+    print ("Simple and pretty fast XML parser for all file sizes")
+    print ("Usage: "+__file__+" <xmlfiles>")
+    exit()
+    return
+import os,sys,re,glob
+filespec = '*.xml'
+blockl = 100000  # Must be large enough to contain an instance of the highest node you are looking for.
+def main():
+    result = read_it(filespec)
+    print("Heading1\t->\tHeading2t")
+    for tag1id in result:
+        for tag2id in result[tag1id]:
+            print(tag1id+"\t->\t"+tag2id)
+    return
+def read_it(filespec):
+    result = {}
+    filelist = glob.glob(filespec)
+    for filename in filelist:
+        if os.path.isfile(filename):
+            with open (filename,"r") as f1:
+                blockrest = ''
+                newblock = f1.read(blockl)
+                # Remove garbage from the beginning of the file
+                garbage_mo = re.match('\s*<\?.*?\?>\s*',newblock)
+                if garbage_mo:
+                    newblock = newblock[garbage_mo.end():]
+                while len(newblock) > 0:
+                    block = blockrest+newblock)
+                    blockrest,result = do_it(block,result)
+                    newblock = f1.read(blockl)
+    return result
+def do_it(block,result):
+    tag1list = get_node(block,'TAG1')
+    for tag1 in tag1list:
+        #Remove processed part of block
+        #block = block.replace(tag1,'',1)
+        #or everything from the beginning
+        dataend = block.find(mecontext)+len(mecontext)
+        block = block[dataend:]
+        tag1id = get_attr(tag1,'ID')
+        if tag1id in result:
+            pass
+        else:
+            result[tag1id] = set()
+        tag2list = get_node(tag1,'TAG2')
+        for tag2 in tag2list:
+            tag2id = get_attr(tag2,'ID')
+            content = get_content(tag2,'TAG3')
+            result[tag1id].add(tag2id)
+    return block,result
+def get_node(block,tag):
+    taglist = re.findall('<\s*'+tag+'[\s\>].*?</'+tag+'\s*>',block,re.DOTALL)
+    if len(taglist) == 0:
+        taglist = re.findall('<\s*'+tag+'.*?/>',block,re.DOTALL)
+    return taglist
+def get_content(block,tag):
+    content = None
+    content_mo = re.search('<\s*'+tag+'[\s\>].*?>?(.*?)</'+tag+'\s*>',block,re.DOTALL)
+    if content_mo:
+        content = content_mo.group(1)
+    return content
+def get_attr(block,attr):
+    attrvalue = None
+    attribute_mo = re.search(r' '+attr+'\s*=\s*[\"\'](.*?)[\"\']',block,re.DOTALL)
+    if attribute_mo:
+        attrvalue = attribute_mo.group(1)
+    return(attrvalue)
 main()
 </syntaxhighlight>