Python:XML

From wiki
Jump to navigation Jump to search
from xml.etree import ElementTree
Load XML-support
tree = ElementTree.parse(filename)
Read the file into an XML-tree
root = tree.getroot()
Get the tree root element object
for element in root:
Get the elements in root
element.tag
element.attrib
element.text
<tag attribname="attrib">text</tag>

Code example:

#!/usr/bin/env python3

def usage():
    """Print a short help text for this script and terminate the program.

    Raises:
        SystemExit: always. No exit status is passed, so the process exit
            code stays 0.
    """
    # Function-scope import: the module-level imports follow this definition.
    import sys

    print("Print XML tags, attributes and text")
    print("Usage: "+__file__+" <xmlfile>")
    # sys.exit() rather than the bare exit() helper: exit() is injected by the
    # ``site`` module for interactive sessions and is absent under ``python -S``.
    sys.exit()

import os
import sys
from xml.etree import ElementTree

def main():
    """Dump the XML file named by the single command-line argument.

    Exactly one argument is expected. With any other argument count the
    usage text is shown instead, and usage() ends the program.
    The descendants of the root element are printed first, followed by the
    root element object itself.
    """
    if len(sys.argv) != 2:
        usage()
    else:
        xml_path = sys.argv[1]
    document_root = get_tree(xml_path).getroot()
    get_all(document_root)
    print(document_root)
    return
    
def get_all(element, prefix=""):
    """Print every descendant of *element*, one line per element.

    Each line shows the indentation, tag, attribute dict and text of one
    element. A tab is added to *prefix* for every nesting level, so direct
    children of *element* are printed with ``prefix`` plus one tab.
    """
    indent = prefix + "\t"
    for node in element:
        print(indent, node.tag, node.attrib, node.text)
        # Depth-first: a node's own children follow directly after it.
        get_all(node, indent)
    return
            
def get_tree(filename):
    """Parse the XML file *filename* and return the resulting ElementTree.

    If *filename* is not an existing regular file, usage() is called
    instead, which prints the help text and ends the program.
    """
    if not os.path.isfile(filename):
        usage()
    else:
        parsed = ElementTree.parse(filename)
    return parsed
    
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()

Flatten XML

The code below converts any XML file into one line per leaf. It processes the file tag by tag and does not read the entire file into memory; therefore it can handle very large files regardless of the memory available.

Output format:

tag1
tag1/tag2.attrib=value
tag1/tag2.attrib=value/tag3.=content
#!/usr/bin/env python3

def usage():
    """Print a short help text for this script and terminate the program.

    Raises:
        SystemExit: always. No exit status is passed, so the process exit
            code stays 0.
    """
    # Function-scope import: the module-level imports follow this definition.
    import sys

    print("Convert any XML to 1 line per leave including attributes and content")
    print("Usage: "+__file__+" [<xmlfiles>]")
    # sys.exit() rather than the bare exit() helper: exit() is injected by the
    # ``site`` module for interactive sessions and is absent under ``python -S``.
    sys.exit()

import os
import sys

# Absolute path of this script file (not referenced again in the code shown here).
scriptname = os.path.abspath(__file__)

## end of standard part ##

import glob,re

# Number of characters requested from the input file per read() call in readit().
blockl = 65536
# readit() keeps calling doit() while its buffer is longer than this; the
# shorter remainder is carried over and prepended to the next chunk.
minblock = 256

def main():
    """Flatten every XML file named on the command line.

    Each command-line argument is expanded as its own glob pattern.
    Previously all arguments were joined with spaces into one pattern, so
    passing more than one file matched nothing. Matches that are not
    regular files are skipped.
    """
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if not os.path.isfile(filename):
                continue
            # The context manager closes the file even if parsing raises.
            with open(filename, "r") as f1:
                print('File is open')
                readit(f1)
    return

def readit(f1):
    """Stream the open text file *f1* through doit() in chunks.

    The file is read ``blockl`` characters at a time. Line breaks are
    removed and whitespace between adjacent tags is collapsed. doit() is
    then called repeatedly while more than ``minblock`` characters are
    buffered; the shorter rest is kept and joined with the next chunk so
    that a tag split across two reads can still be recognised. After the
    last read the remaining buffer is drained.

    Args:
        f1: Text-mode file object positioned at the start of the XML data.
    """
    taglst = []
    block = ''
    while True:
        newblock = f1.read(blockl)
        if not newblock:
            break
        block = re.sub(r'>\s*<', '><', block + newblock.replace('\n', '').replace('\r', ''))
        while len(block) > minblock:
            before = len(block)
            block, taglst = doit(block, taglst)
            if len(block) == before:
                # doit() consumed nothing: the token at the front is probably
                # incomplete, so fetch more data instead of spinning forever.
                break
    while len(block) > 1:
        before = len(block)
        block, taglst = doit(block, taglst)
        if len(block) == before:
            # Nothing left that doit() can parse; report it and stop rather
            # than looping endlessly on the same input.
            sys.stderr.write('Warning: ' + str(len(block))
                             + ' unparsed characters at end of input\n')
            break
    return

def doit(block,taglst):
    """Consume one step of XML from the front of *block* and print its path.

    A single call handles, in this order: an optional leading ``<?...?>``
    declaration, at most one opening or self-closing tag including its
    attributes, the text that follows it, and at most one closing tag.

    Printed line format: the segments of all open elements joined by ``/``,
    where a segment is ``tag``, ``tag.attr=value...`` and/or ``.=text``.

    Args:
        block: Buffered XML text. The patterns do not match across line
            breaks, so the caller is expected to have removed them.
        taglst: Stack with the path segments of the currently open
            elements. It is modified in place.

    Returns:
        Tuple ``(block, taglst)``: the unconsumed rest of *block* and the
        updated stack. *block* is returned unchanged when nothing at its
        start could be recognised (for example an incomplete tag).

    Notes:
        * When a call finds only a closing tag, the path of the element
          being closed is printed once more before it is popped. This is
          kept from the earlier implementation so existing output stays
          the same.
        * Text that follows a closing tag is printed as ``path/=text``
          without being pushed on the stack; it used to raise an
          UnboundLocalError.
        * Self-closing siblings are no longer swallowed by the closing-tag
          pattern, which used to pop the wrong element.
        * Only names starting with ``/`` or ``?`` are excluded from the
          opening-tag pattern, so comments and ``<!DOCTYPE>`` are still
          treated as opening tags.
    """
    # Drop an XML declaration / processing instruction at the very start.
    mo1 = re.match(r'\s*<\?.*?\?>\s*', block)
    if mo1:
        block = block[mo1.end():]

    # Opening or self-closing tag: group 1 is name plus attributes, group 2
    # is the optional '/' of a self-closing tag.
    segment = None
    selfclosing = False
    mo1 = re.match(r'\s*<\s*([^\/\?].*?)(\/)?>', block)
    if mo1:
        selfclosing = mo1.group(2) is not None
        block = block[mo1.end():]
        inner = mo1.group(1)
        segment = [re.match(r'\S*', inner).group(0)]
        mo2 = re.search(r'\s+(.*)', inner)
        if mo2:
            attdct = get_attr(mo2.group(0))
            segment.append('.'.join(attr + '=' + attdct[attr] for attr in attdct))

    # Text up to the next '<'; the '<' itself stays in the buffer.
    text = None
    mo1 = re.match(r'([^<].*?)<', block)
    if mo1:
        block = block[mo1.end()-1:]
        text = mo1.group(1)

    closing = re.match(r'\s*<\/.*?>', block)

    if segment is not None:
        if text is not None:
            segment.append('=' + text)
        taglst.append('.'.join(segment))
        print('/'.join(taglst))
        if selfclosing:
            taglst.pop()
    elif text is not None:
        # Text with no tag opened in this call: report it under the current
        # path, but keep it off the stack so the next closing tag pops the
        # right element.
        print('/'.join(taglst + ['=' + text]))
    elif closing:
        print('/'.join(taglst))

    if closing:
        # Guard against a stray closing tag when no element is open.
        if taglst:
            taglst.pop()
        block = block[closing.end():]

    return (block,taglst)
    
def get_attr(attrs):
    """Parse the attribute part of a tag into a dict.

    Args:
        attrs: Text following the tag name, e.g. ``' id="1" lang="en"'``.

    Returns:
        Dict mapping attribute name to value, in order of appearance.
        Quoted values keep their surrounding quotes, as before. A quoted
        value may now contain whitespace, and names may contain ``:``,
        ``.`` and ``-``. Previously such values were cut at the first
        space and such names lost everything before the last of those
        characters.
    """
    # Value alternatives: double-quoted, single-quoted, or (as a fallback for
    # unquoted / unterminated values) a run of non-whitespace.
    pattern = r'''([\w:.-]+)\s*=\s*("[^"]*"|'[^']*'|\S+)'''
    return {att.group(1): att.group(2) for att in re.finditer(pattern, attrs)}
    
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()