Difference between revisions of "Python:XML"

From wiki
Jump to navigation Jump to search
Line 1: Line 1:
TODO: Check out [http://effbot.org/zone/element-iterparse.htm interparse]  for parsing large files.
+
TODO:  
 +
* Check [http://effbot.org/zone/element-iterparse.htm iterparse]  for parsing large files.
 +
* Check lxml.etree as, [https://lxml.de/1.3/compatibility.html according to the developers], it offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer.
  
 
;from xml.etree import ElementTree
 
;from xml.etree import ElementTree

Revision as of 10:10, 26 April 2019

TODO:

  • Check iterparse for parsing large files.
  • Check lxml.etree as, according to the developers, it offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer.
from xml.etree import ElementTree
Load XML-support
tree = ElementTree.parse(filename)
Read the file into an XML-tree
root = tree.getroot()
Get the tree root element object
for element in root
Get the elements in root
element.tag
element.attrib
element.text
<tag attribname=attrib>text</tag>

Code example:

#!/usr/bin/env python3

def usage():
    """Print a short usage message and terminate the script.

    Raises:
        SystemExit: always, with the default (success) exit status.
    """
    # Imported locally because the file-level imports follow this definition.
    # sys.exit() is used instead of the exit() helper, which the site module
    # only installs for interactive sessions.
    import sys

    print("Print XML tags, attributes and text")
    print("Usage: " + __file__ + " <xmlfile>")
    sys.exit()

import os
import sys
from xml.etree import ElementTree

def main():
    """Dump the XML file named by the single command-line argument.

    Exactly one argument is expected; with any other argument count the
    usage text is shown (usage() terminates the script).  The parsed tree
    is walked with get_all() and the root element object is printed last.
    """
    if len(sys.argv) != 2:
        usage()
    else:
        filename = sys.argv[1]
    root = get_tree(filename).getroot()
    get_all(root)
    print(root)
    
def get_all(element, prefix=""):
    """Recursively print every descendant of *element*, one per line.

    Each line shows the child's tag, attribute dict and text, preceded by
    one tab per nesting level (*prefix* carries the indentation of the
    parent level).  *element* itself is not printed.
    """
    indent = prefix + "\t"
    for node in element:
        print(indent, node.tag, node.attrib, node.text)
        # Descend before moving on to the next sibling (depth-first order).
        get_all(node, indent)
            
def get_tree(filename):
    """Parse *filename* and return the resulting ElementTree.

    When *filename* is not an existing regular file the usage text is
    shown instead (usage() terminates the script).
    """
    if not os.path.isfile(filename):
        usage()
    else:
        tree = ElementTree.parse(filename)
    return tree
    
# Run the script: main() is called unconditionally when this file is loaded.
main()

Flatten XML

The code below converts any XML file into one line per leaf. It processes the file tag by tag and does not read the entire file into memory; therefore it can handle very large files regardless of the memory available. It is very slow when large files are processed (iterparse may be helpful here).

Output format:

tag1
tag1/tag2.attrib.=value
tag1/tag2.attrib.=value/tag3 content
#!/usr/bin/env python3

def usage():
    """Print a short usage message and terminate the script.

    Raises:
        SystemExit: always, with the default (success) exit status.
    """
    # Imported locally because the file-level imports follow this definition.
    # sys.exit() is used instead of the exit() helper, which the site module
    # only installs for interactive sessions.
    import sys

    print("Convert any XML to 1 line per leave including attributes and content")
    print("Usage: " + __file__ + " [<xmlfiles>]")
    sys.exit()

import os
import sys

# Absolute path of this script file (not referenced by the functions in this script).
scriptname = os.path.abspath(__file__)

## end of standard part ##

import glob,re

# Number of characters requested from the input file per read() call in readit().
blockl = 65536
# readit() keeps calling doit() while the buffer is longer than this many
# characters; the shorter remainder is prepended to the next chunk read.
minblock = 256

def main():
    """Flatten every XML file selected by the command-line arguments.

    Each argument is treated as its own glob pattern.  (Joining all
    arguments into one space-separated pattern, as was done before, made
    any invocation with more than one argument match nothing.)  Every
    match that is a regular file is opened and handed to readit(); the
    file is closed again as soon as readit() returns or raises.
    """
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if not os.path.isfile(filename):
                continue
            with open(filename, "r") as f1:
                print('File is open')
                readit(f1)

def readit(f1):
    """Read the open text file *f1* chunk by chunk and feed it to doit().

    Every chunk of ``blockl`` characters has its line breaks removed and
    the whitespace between adjacent tags collapsed.  doit() is then called
    repeatedly while more than ``minblock`` characters are buffered; the
    remainder is carried over and prepended to the next chunk (presumably
    so that a tag cut in half by a read boundary is completed first).
    After the last chunk the remaining buffer is drained.

    If a doit() call consumes nothing, the loop stops instead of spinning
    forever: while reading, more input is fetched; after end of file, the
    unparseable rest is dropped.
    """
    taglst = []      # stack of currently open tags, maintained by doit()
    blockrest = ''   # unprocessed tail of the previous chunk
    newblock = f1.read(blockl)
    while len(newblock) > 0:
        joined = blockrest + newblock.replace('\n', '').replace('\r', '')
        block = re.sub(r'>\s*<', '><', joined)
        while len(block) > minblock:
            before = len(block)
            block, taglst = doit(block, taglst)
            if len(block) == before:
                # Nothing was consumed (e.g. long text whose terminating
                # '<' has not been read yet): fetch more input first.
                break
        blockrest = block
        newblock = f1.read(blockl)
    block = blockrest
    while len(block) > 1:
        before = len(block)
        block, taglst = doit(block, taglst)
        if len(block) == before:
            # Trailing residue that doit() cannot parse; stop here.
            break
    return

def doit(block,taglst):
    """Consume one step from the front of *block* and print the current path.

    One call handles, in this order: an optional leading processing
    instruction (``<?...?>``), one opening or self-closing tag with its
    attributes, the text that follows it, and one closing tag.  The tag is
    pushed on *taglst* as ``name[.attr=value...][.=text]`` and the whole
    stack is printed joined with ``/``.

    Args:
        block: Unparsed XML text; line breaks are expected to have been
            removed by the caller.
        taglst: Stack (list) of the entries of the currently open tags.
            It is modified in place.

    Returns:
        Tuple ``(block, taglst)``: the unconsumed rest of *block* and the
        updated stack.
    """
    # Skip a leading processing instruction such as the XML declaration.
    prolog = re.match(r'\s*<\?.*?\?>\s*', block)
    if prolog:
        block = block[prolog.end():]

    parts = []            # pieces of the stack entry built in this call
    self_closing = False
    # An opening tag: anything in <...> not starting with '/' or '?'.
    opened = re.match(r'\s*<\s*([^\/\?].*?)(\/)?>', block)
    if opened:
        self_closing = bool(opened.group(2))
        block = block[opened.end():]
        inner = opened.group(1)
        # The tag name is everything up to the first whitespace.
        parts.append(re.match(r'\S*', inner).group(0))
        attrs = re.search(r'\s+(.*)', inner)
        if attrs:
            attdct = get_attr(attrs.group(0))
            parts.append('.'.join(
                name + '=' + value for name, value in attdct.items()))

    # Text up to (not including) the next '<'.
    text = re.match(r'([^<].*?)<', block)
    if text:
        block = block[text.end() - 1:]
        parts.append('=' + text.group(1))

    if opened:
        taglst.append('.'.join(parts))
        print('/'.join(taglst))
    elif parts:
        # Text that is not preceded by a tag in this call (e.g. text after
        # a child's closing tag): report it under the enclosing element
        # without pushing it, so the stack stays balanced.
        print('/'.join(taglst + parts))
    else:
        print('/'.join(taglst))

    # A self-closing tag is closed as soon as it has been reported.
    if self_closing:
        taglst.pop()

    # Closing tag.  Only '</...>' counts: a following self-closing sibling
    # such as '<b/>' must be left for the next call, otherwise it would be
    # swallowed and its parent popped too early.
    closing = re.match(r'\s*<\/.*?>', block)
    if closing:
        if taglst:  # tolerate a stray closing tag in unbalanced input
            taglst.pop()
        block = block[closing.end():]

    return (block,taglst)
    
def get_attr(attrs):
    """Split the attribute part of a tag into a dict.

    Args:
        attrs: The text that follows the tag name, e.g. ``' a="1" b=2'``.

    Returns:
        Dict mapping each attribute name to its value exactly as written,
        surrounding quotes included.  A quoted value may contain
        whitespace; an unquoted value ends at the first whitespace.
    """
    attdct = {}
    # Try a complete quoted string first so that values containing spaces
    # are not cut off at the first blank.
    pattern = r'''(\w+)=("[^"]*"|'[^']*'|\S+)'''
    for att in re.finditer(pattern, attrs):
        attdct[att.group(1)] = att.group(2)
    return attdct
    
# Run the script: main() is called unconditionally when this file is loaded.
main()

Parse into dict

#!/usr/bin/env python2

def usage():
    """Print a short usage message and terminate the script.

    Raises:
        SystemExit: always, with the default (success) exit status.
    """
    # Imported locally because the file-level imports follow this definition.
    # sys.exit() is used instead of the exit() helper, which the site module
    # only installs for interactive sessions.
    import sys

    print("Simple and pretty fast XML parser for all file sizes")
    print("Usage: " + __file__ + " <xmlfiles>")
    sys.exit()
import os,sys,re,glob


# Glob pattern selecting the input files; main() passes it to read_it().
filespec = '*.xml'
blockl = 100000  # Must be large enough to contain an instance of the highest node you are looking for.


def main():
    """Parse the files matching ``filespec`` and print the result as a table.

    read_it() returns a mapping from each first-level ID to a collection
    of second-level IDs; one tab-separated line is printed per pair,
    preceded by a header line.
    """
    result = read_it(filespec)
    print("Heading1\t->\tHeading2t")
    for tag1id, tag2ids in result.items():
        for tag2id in tag2ids:
            print(tag1id+"\t->\t"+tag2id)


def read_it(filespec):
    """Parse every regular file matching the glob pattern *filespec*.

    Each file is read in chunks of ``blockl`` characters.  A leading
    ``<?...?>`` declaration is stripped from the first chunk.  Every chunk
    is prefixed with the part of the previous one that do_it() left
    unprocessed and then passed to do_it().

    Args:
        filespec: Glob pattern of the files to read.

    Returns:
        The dict filled by do_it(); empty when no file matched.
    """
    result = {}
    for filename in glob.glob(filespec):
        if not os.path.isfile(filename):
            continue
        with open(filename, "r") as f1:
            blockrest = ''
            newblock = f1.read(blockl)
            # Remove garbage from the beginning of the file
            garbage_mo = re.match(r'\s*<\?.*?\?>\s*', newblock)
            if garbage_mo:
                newblock = newblock[garbage_mo.end():]
            while len(newblock) > 0:
                block = blockrest + newblock
                blockrest, result = do_it(block, result)
                newblock = f1.read(blockl)
    return result


def do_it(block,result):
    """Collect the TAG2 IDs found below every complete TAG1 node in *block*.

    Args:
        block: XML text that may end in the middle of a TAG1 node.
        result: Dict mapping a TAG1 ``ID`` to the set of ``ID`` values of
            its TAG2 children.  It is updated in place.

    Returns:
        Tuple ``(block, result)`` where *block* is the text after the last
        TAG1 node that was processed, to be prepended to the next chunk.
    """
    for tag1 in get_node(block, 'TAG1'):
        # Drop everything up to and including this node so that only the
        # unprocessed tail is handed back to the caller.
        dataend = block.find(tag1) + len(tag1)
        block = block[dataend:]

        tag1id = get_attr(tag1, 'ID')
        if tag1id not in result:
            result[tag1id] = set()
        for tag2 in get_node(tag1, 'TAG2'):
            tag2id = get_attr(tag2, 'ID')
            # The TAG3 content is extracted here but is not stored in result.
            content = get_content(tag2, 'TAG3')
            result[tag1id].add(tag2id)
    return block,result


def get_node(block,tag):
    """Return the text of every *tag* element found in *block*.

    Elements with a separate closing tag are looked for first, each from
    ``<tag`` up to and including the first following ``</tag>``.  Only
    when none is found are self-closing elements (``<tag ... />``)
    returned instead.

    Args:
        block: XML text to search.
        tag: Element name; it is inserted into the pattern unescaped.

    Returns:
        List of matched element strings, in document order; empty when
        nothing matches.
    """
    paired = r'<\s*' + tag + r'[\s\>].*?</' + tag + r'\s*>'
    taglist = re.findall(paired, block, re.DOTALL)
    if len(taglist) == 0:
        taglist = re.findall(r'<\s*' + tag + r'.*?/>', block, re.DOTALL)
    return taglist


def get_content(block,tag):
    """Return the text between ``<tag ...>`` and ``</tag>`` in *block*.

    Args:
        block: XML text to search.
        tag: Element name; it is inserted into the pattern unescaped.

    Returns:
        The content of the first matching element, or ``None`` when
        *block* contains no such element.
    """
    # The opening tag is matched up to its closing '>' so that attribute
    # text does not end up in the captured content.
    pattern = r'<\s*' + tag + r'(?:\s[^>]*)?>(.*?)</' + tag + r'\s*>'
    content_mo = re.search(pattern, block, re.DOTALL)
    if content_mo is None:
        return None
    return content_mo.group(1)


def get_attr(block,attr):
    """Return the value of the first attribute named *attr* in *block*.

    Args:
        block: XML text, typically one element including its start tag.
        attr: Attribute name; it is inserted into the pattern unescaped.

    Returns:
        The attribute value without its surrounding quotes, or ``None``
        when the attribute does not occur.
    """
    # The name must follow whitespace so that e.g. 'XID' does not match
    # 'ID'; the back-reference makes the closing quote match the opening one.
    pattern = r'\s' + attr + r'''\s*=\s*(["'])(.*?)\1'''
    attribute_mo = re.search(pattern, block, re.DOTALL)
    if attribute_mo is None:
        return None
    return attribute_mo.group(2)


# Run the script: main() is called unconditionally when this file is loaded.
main()