Python:XML

From wiki
Revision as of 23:15, 27 January 2018 by Hdridder (talk | contribs)
Jump to navigation Jump to search
from xml.etree import ElementTree
Load XML-support
tree = get_tree(filename)
Read the file into an XML tree (get_tree is the helper function defined in the code example below; it wraps ElementTree.parse)
root = tree.getroot()
Get the tree root element object
for element in root
Get the elements in root
element.tag
element.attrib
element.text
<tag attribname=attrib>text</tag>

Code example:

#!/usr/bin/env python3

def usage():
    """Print a short usage message and terminate the program.

    Raises:
        SystemExit: always, with the default (success) exit status, exactly
            as the previous ``exit()`` call did.
    """
    print ("Print XML tags, attributes and text")
    print ("Usage: "+__file__+" <xmlfile>")
    # sys.exit() instead of the builtin exit(): the latter is injected by the
    # `site` module for interactive use and is missing under `python -S`.
    sys.exit()

import os
import sys
from xml.etree import ElementTree

def main():
    """Dump the XML file named by the single command-line argument.

    Prints one line per element below the root (via get_all) and finally
    the root element object itself.
    """
    # Exactly one argument is expected; usage() terminates the program.
    if len(sys.argv) != 2:
        usage()
    xml_root = get_tree(sys.argv[1]).getroot()
    get_all(xml_root)
    print(xml_root)
    
def get_all(element,prefix=""):
    """Print tag, attributes and text of every descendant of *element*.

    Descendants are visited in document order. Each line starts with
    *prefix* followed by one tab character per nesting level below
    *element*. The element itself is not printed. Returns None.
    """
    # Depth-first walk with an explicit stack of (child iterator, indent).
    pending = [(iter(element), prefix + "\t")]
    while pending:
        siblings, indent = pending[-1]
        node = next(siblings, None)
        if node is None:
            # This level is exhausted; resume with the parent's siblings.
            pending.pop()
            continue
        print (indent,node.tag,node.attrib,node.text)
        pending.append((iter(node), indent + "\t"))
            
def get_tree(filename):
    """Parse *filename* and return the resulting ElementTree.

    When *filename* is not an existing regular file, usage() is called,
    which prints the help text and terminates the program.
    """
    if not os.path.isfile(filename):
        usage()
    return ElementTree.parse(filename)
    
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()

Flatten XML

The code below converts any XML file into one line per leaf. Output format:

tag1
tag1/tag2.attrib.=value
tag1/tag2.attrib.=value/tag3 content
#!/usr/bin/env python3

def usage():
    """Print a short usage message and terminate the program.

    Raises:
        SystemExit: always, with the default (success) exit status, exactly
            as the previous ``exit()`` call did.
    """
    print ("Convert any XML to 1 line per leave including attributes and content")
    print ("Usage: "+__file__+" [<xmlfiles>]")
    # sys.exit() instead of the builtin exit(): the latter is injected by the
    # `site` module for interactive use and is missing under `python -S`.
    sys.exit()

import os
import sys

# Absolute path of this script file. Not referenced anywhere else in the
# visible code; presumably part of the author's standard boilerplate.
scriptname = os.path.abspath(__file__)

## end of standard part ##

import glob,re

# Number of characters requested from the input file per read() call in readit().
blockl = 65536
# While more input may follow, readit() stops parsing a chunk once this many
# characters or fewer remain; the remainder is carried over and prepended to
# the next chunk (presumably so a tag cut off at a chunk boundary is complete
# before it is parsed - TODO confirm).
minblock = 256

def main():
    """Flatten every file matched by the command-line arguments.

    Each argument is expanded as its own glob pattern. Previously all
    arguments were joined with spaces into a single pattern, so passing
    more than one file matched nothing. Every match that is a regular file
    is opened in text mode and handed to readit().
    """
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if not os.path.isfile(filename):
                continue
            # The context manager closes the file even if parsing fails.
            with open(filename, "r") as f1:
                print('File is open')
                readit(f1)
    return

def readit(f1):
    """Read an open text file in chunks and feed the content to doit().

    Args:
        f1: open text-mode file object; read() is called with blockl
            characters at a time.

    Newlines and carriage returns are removed from each chunk and whitespace
    between '>' and '<' is collapsed. While more input may follow, parsing
    stops once minblock characters or fewer remain; that remainder is
    prepended to the next chunk. After the last chunk the remainder is
    parsed until at most one character is left.

    Both parsing loops stop as soon as doit() consumes nothing. Without
    that check, input doit() cannot match (for example trailing spaces
    after the last closing tag) made the loop spin forever.
    """
    taglst = []
    blockrest = ''
    newblock = f1.read(blockl)
    while len(newblock) > 0:
        block = re.sub(r'>\s*<','><',blockrest+newblock.replace('\n','').replace('\r',''))
        while len(block) > minblock:
            before = len(block)
            block,taglst = doit(block,taglst)
            if len(block) == before:
                # No progress: wait for more input instead of looping forever.
                break
        blockrest = block
        newblock = f1.read(blockl)
    block = blockrest
    while len(block) > 1:
        before = len(block)
        block,taglst = doit(block,taglst)
        if len(block) == before:
            # Unparseable remainder at end of input: give up on it.
            break
    return

def doit(block,taglst):
    """Consume one parsing step from the front of *block* and print a path line.

    A step consists of, in this order and each part optional:
      1. a leading ``<?...?>`` declaration, which is dropped;
      2. one opening or self-closing tag, with its attributes;
      3. text content up to the next ``<``;
      4. one closing tag.

    Args:
        block: remaining XML text (newlines already removed by the caller).
        taglst: stack of path components for the currently open elements;
            modified in place.

    Returns:
        Tuple ``(block, taglst)``: the unconsumed text and the updated stack.

    Output: exactly one line per call. When a tag was found, the component
    ``tag[.attr=value...][.=content]`` is pushed and the stack is printed
    joined by ``/``. When only text was found (text following a closing
    tag), the current path is printed with ``.=text`` appended; this case
    used to crash with NameError. When neither was found (for example a
    bare closing tag), the current path is printed unchanged.

    Note: comments and ``<!DOCTYPE ...>`` are not recognised specially; they
    match the opening-tag pattern.
    """
    # Remove an XML declaration / processing instruction from the beginning
    mo1 = re.match(r'\s*<\?.*?\?>\s*',block)
    if mo1:
        block = block[mo1.end():]

    # Find a tag; tagl1 stays None when the block does not start with one
    tagl1 = None
    endtag = None
    mo1 = re.match(r'\s*<\s*([^\/\?].*?)(\/)?>',block)
    if mo1:
        # Group 2 is the '/' of a self-closing tag, or None.
        endtag = mo1.group(2)
        block = block[mo1.end():]
        # The tag name is the leading run of non-whitespace characters.
        tagl1 = [re.match(r'\S*',mo1.group(1)).group(0)]
        # Get the tag attributes
        mo2 = re.search(r'\s+(.*)',mo1.group(1))
        if mo2:
            attdct = get_attr(mo2.group(0))
            tagl1.append('.'.join(attr+'='+value for attr, value in attdct.items()))

    # Find content: everything up to, but not including, the next '<'
    content = None
    mo1 = re.match(r'([^<].*?)<',block)
    if mo1:
        block = block[mo1.end()-1:]
        content = mo1.group(1)

    if tagl1 is not None:
        if content is not None:
            tagl1.append('='+content)
        taglst.append('.'.join(tagl1))
        print('/'.join(taglst))
    elif content is not None:
        # Text that follows a closing tag belongs to the enclosing element;
        # nothing is pushed so the following closing tag pops the right entry.
        print('/'.join(taglst)+'.='+content)
    else:
        print('/'.join(taglst))

    # A self-closing tag is finished immediately
    if endtag:
        taglst.pop()
    # Find end-tag. Only real closing tags count here: also accepting
    # '<name/>' made a following self-closing tag pop its parent.
    mo1 = re.match(r'\s*<\/.*?>',block)
    if mo1:
        taglst.pop()
        block = block[mo1.end():]

    return (block,taglst)
    
def get_attr(attrs):
    """Parse the attribute part of a tag into a dict.

    Args:
        attrs: text following the tag name, e.g. ``' id="1" title="a b"'``.

    Returns:
        Dict mapping attribute name to its raw value. Surrounding quotes
        are kept, as before. Quoted values may now contain whitespace
        (they used to be cut at the first space), and names may contain
        ``:``, ``.`` and ``-`` (e.g. ``xml:lang``).
    """
    # Value alternatives: double-quoted, single-quoted, or a bare run of
    # non-whitespace characters as a fallback for unquoted/malformed input.
    pattern = r'''([\w:.-]+)=("[^"]*"|'[^']*'|\S+)'''
    return {att.group(1): att.group(2) for att in re.finditer(pattern, attrs)}
    
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()