Difference between revisions of "Python:XML"

From wiki
Jump to navigation Jump to search
(Created page with ";from xml.etree import ElementTree :Load XML-support ;tree = get_tree(filename) :Read the file into an XML-tree ;root = tree.getroot() :Get the tree root element object ;fo...")
 
Line 16: Line 16:
 
:<tag attribname=attrib>text</tag>
 
:<tag attribname=attrib>text</tag>
  
 
+
Code example:
 
<syntaxhighlight lang=python>
 
<syntaxhighlight lang=python>
 
#!/usr/bin/env python3
 
#!/usr/bin/env python3
Line 56: Line 56:
 
         usage()  
 
         usage()  
 
     return(tree)
 
     return(tree)
 +
   
 +
main()
 +
</syntaxhighlight>
 +
 +
===Flatten XML===
 +
The code below converts any XML file into one line per leaf.
 +
Output format:
 +
tag1
 +
tag1/tag2.attrib.=value
 +
tag1/tag2.attrib.=value/tag3 content
 +
 +
<syntaxhighlight lang=python>
 +
#!/usr/bin/env python3
 +
 +
def usage():
 +
    print ("Convert any XML to 1 line per leave including attributes and content")
 +
    print ("Usage: "+__file__+" [<xmlfiles>]")
 +
    exit()
 +
    return
 +
 +
import os
 +
import sys
 +
 +
scriptname = os.path.abspath(__file__)
 +
 +
## end of standard part ##
 +
 +
import glob,re
 +
 +
blockl = 65536
 +
minblock = 256
 +
 +
def main():
 +
    files = glob.glob(' '.join(sys.argv[1:]))
 +
    for filename in files:
 +
        if os.path.isfile(filename):
 +
            f1 =  open (filename,"r")
 +
            if f1:
 +
                print('File is open')
 +
                readit(f1)
 +
    return
 +
 +
def readit(f1):
 +
    eof = 0
 +
    taglst = []
 +
    blockrest = ''
 +
    newblock = f1.read(blockl)
 +
    while len(newblock) > 0:
 +
        block = re.sub('>\s*<','><',blockrest+newblock.replace('\n','').replace('\r',''))
 +
        while len(block) > minblock:
 +
            block,taglst = doit(block,taglst)
 +
        blockrest = block
 +
        newblock = f1.read(blockl)
 +
        #print('newblocklengte = '+str(len(newblock)))
 +
    block = blockrest
 +
    while len(block) > 1:
 +
        block,taglst = doit(block,taglst)
 +
        #print('Blocklengte = '+str(len(block)))
 +
    return
 +
 +
def doit(block,taglst):
 +
    #print('blockt '+block)
 +
    #mo1 = re.finditer('<([^\?]\S*?)(\s+.*?=.*?)?>([^<].*?<)?(\/.*?>)?',block)
 +
    # Remove garbage from the beginning
 +
    mo1 = re.match('\s*<\?.*?\?>\s*',block)
 +
    if mo1:
 +
        block = block[mo1.end():]
 +
    # Find a tag
 +
    endtag = ''
 +
    mo1 = re.match('\s*<\s*([^\/\?].*?)(\/)?>',block)
 +
    if mo1:
 +
        #print('group0tag '+mo1.group(0))
 +
        try:
 +
            endtag = mo1.group(2)
 +
        except NameError:
 +
            pass
 +
        block = block[mo1.end():]
 +
        #print('mo1group ',mo1.group(1))
 +
        mo2 = re.match('\S*',mo1.group(1))
 +
        if mo2:
 +
            tag = mo2.group(0)
 +
            tagl1 = [tag]
 +
        # Get the tag attributes
 +
        mo2 = re.search('\s+(.*)',mo1.group(1))
 +
        if mo2:
 +
            attdct = get_attr(mo2.group(0))
 +
            attrline = ''
 +
            for attr in attdct:
 +
                attrline += '.'+attr+'='+attdct[attr]
 +
            tagl1.append(attrline[1:])
 +
    #print('blockc '+block)
 +
    # Find content
 +
    mo1 = re.match('([^<].*?)<',block)
 +
    if mo1:
 +
        #print('group0cont '+mo1.group(0))
 +
        block = block[mo1.end()-1:]
 +
        content = mo1.group(1)
 +
        tagl1.append('='+content)
 +
       
 +
    try:
 +
        taglst.append('.'.join(tagl1))
 +
    except:
 +
        pass
 +
    print('/'.join(taglst))
 +
    # Find end-tag
 +
    if endtag:
 +
        popped = taglst.pop()
 +
        #print('Popped '+popped)
 +
    #print('blocke '+block)
 +
    mo1 = re.match('\s*(<\/.*?>|<\w*?\/>)',block)
 +
    if mo1:
 +
        popped = taglst.pop()
 +
        #print('Popped '+mo1.group(0))
 +
        block = block[mo1.end():]
 +
 +
    return (block,taglst)
 +
   
 +
def get_attr(attrs):
 +
    attdct = {}
 +
    mo2 = re.finditer('(\w+)=(\S+)',attrs)
 +
    if mo2:
 +
        for att in mo2:
 +
            attdct[att.group(1)] = att.group(2)
 +
    return(attdct)
 
      
 
      
 
main()
 
main()
 
</syntaxhighlight>
 
</syntaxhighlight>

Revision as of 23:15, 27 January 2018

from xml.etree import ElementTree
Load XML-support
tree = get_tree(filename)
Read the file into an XML-tree
root = tree.getroot()
Get the tree root element object
for element in root
Get the elements in root
element.tag
element.attrib
element.text
<tag attribname=attrib>text</tag>

Code example:

#!/usr/bin/env python3

def usage():
    """Print a short usage message and terminate the program.

    Raises:
        SystemExit: always, with the default (success) exit status.
    """
    print ("Print XML tags, attributes and text")
    print ("Usage: "+__file__+" <xmlfile>")
    # sys.exit() is the supported way to leave a program; the bare exit()
    # helper is injected by the site module for interactive sessions and is
    # missing when the interpreter runs without it (e.g. python -S).
    sys.exit()

import os
import sys
from xml.etree import ElementTree

def main():
    """Dump the element tree of the XML file named on the command line.

    Exactly one command-line argument (the XML file) is expected; any other
    argument count is handed to usage(). Every descendant of the root is
    printed through get_all(), followed by the root element object itself.
    """
    if len(sys.argv) != 2:
        usage()
    else:
        filename = sys.argv[1]
    root = get_tree(filename).getroot()
    get_all(root)
    print(root)
    
def get_all(element, prefix=""):
    """Recursively print tag, attributes and text of every descendant.

    Args:
        element: iterable node whose children expose ``tag``, ``attrib``
            and ``text``.
        prefix: indentation accumulated so far; one tab is added per level.
    """
    indent = prefix + "\t"
    for node in element:
        # Print the child first, then descend, so output is in document order.
        print(indent, node.tag, node.attrib, node.text)
        get_all(node, indent)
            
def get_tree(filename):
    """Parse *filename* into an ElementTree and return it.

    When *filename* is not an existing regular file, usage() is called
    instead of parsing.
    """
    if not os.path.isfile(filename):
        usage()
    else:
        tree = ElementTree.parse(filename)
    return tree
    
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()

Flatten XML

The code below converts any XML file into one line per leaf. Output format:

tag1
tag1/tag2.attrib.=value
tag1/tag2.attrib.=value/tag3 content
#!/usr/bin/env python3

def usage():
    """Print a short usage message and terminate the program.

    Raises:
        SystemExit: always, with the default (success) exit status.
    """
    print ("Convert any XML to 1 line per leave including attributes and content")
    print ("Usage: "+__file__+" [<xmlfiles>]")
    # sys.exit() is the supported way to leave a program; the bare exit()
    # helper is injected by the site module for interactive sessions and is
    # missing when the interpreter runs without it (e.g. python -S).
    sys.exit()

import os
import sys

# Absolute path of this script file (not referenced by the code shown here).
scriptname = os.path.abspath(__file__)

## end of standard part ##

import glob,re

# Number of characters requested from the input file per read() call in readit().
blockl = 65536
# readit() keeps passing its buffer to doit() only while the buffer is longer
# than this; the shorter remainder is carried over and prepended to the next chunk.
minblock = 256

def main():
    """Flatten every XML file matched by the command-line arguments.

    Each argument is expanded as its own glob pattern. Joining all arguments
    into a single space-separated pattern, as was done before, matches
    nothing as soon as more than one file or pattern is given. Every match
    that is a regular file is opened in text mode and handed to readit().
    """
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if os.path.isfile(filename):
                # The context manager closes the file even if parsing fails.
                with open(filename, "r") as f1:
                    print('File is open')
                    readit(f1)
    return

def readit(f1):
    """Read an open text file in chunks and feed its content to doit().

    Line breaks are removed and whitespace between adjacent tags is
    collapsed before parsing. While more input may follow, the buffer is
    only parsed down to ``minblock`` characters so that a tag or text run
    cut off at a chunk boundary is completed by the next read. After the
    last chunk the remaining buffer is parsed until at most one character
    is left.

    Args:
        f1: file object opened for reading text.
    """
    taglst = []
    blockrest = ''
    newblock = f1.read(blockl)
    while len(newblock) > 0:
        block = re.sub(r'>\s*<', '><',
                       blockrest + newblock.replace('\n', '').replace('\r', ''))
        while len(block) > minblock:
            before = len(block)
            block, taglst = doit(block, taglst)
            if len(block) == before:
                # doit() consumed nothing; calling it again would repeat
                # forever, so fetch more input instead.
                break
        blockrest = block
        newblock = f1.read(blockl)
    block = blockrest
    while len(block) > 1:
        before = len(block)
        block, taglst = doit(block, taglst)
        if len(block) == before:
            # Unparseable remainder (e.g. trailing blanks after the last
            # tag): stop instead of looping endlessly.
            break
    return

def doit(block, taglst):
    """Consume one parsing step from the front of *block* and print a path.

    A step is, in order: an optional leading ``<?...?>`` declaration, an
    optional opening (or self-closing) tag with its attributes, the text
    directly following an opening tag, and at most one closing tag.

    The current path (entries of *taglst* joined by ``/``) is printed once
    per call, also when the call only consumes a closing tag. An element
    is rendered as ``tag[.attr=value...][.=text]``.

    Changes compared with the previous behaviour:
      * text that is not preceded by a tag in the same call (tail text in
        mixed content) is printed as an anonymous ``=text`` leaf below the
        current path instead of raising UnboundLocalError;
      * a self-closing tag never takes the text that follows it;
      * a self-closing sibling is no longer mistaken for a closing tag
        (which popped the parent and dropped the sibling);
      * a closing tag on an empty stack is consumed without raising;
      * an empty attribute part (e.g. for ``<a />``) is not appended.

    Args:
        block: unparsed XML text with line breaks already removed.
        taglst: stack of the currently open elements; modified in place.

    Returns:
        Tuple ``(block, taglst)`` with the unconsumed rest of *block* and
        the same stack object.
    """
    # Skip a leading XML declaration / processing instruction.
    decl = re.match(r'\s*<\?.*?\?>\s*', block)
    if decl:
        block = block[decl.end():]

    # Opening or self-closing tag.
    parts = None
    self_closing = False
    opening = re.match(r'\s*<\s*([^\/\?].*?)(\/)?>', block)
    if opening:
        self_closing = bool(opening.group(2))
        block = block[opening.end():]
        inner = opening.group(1)
        parts = [re.match(r'\S*', inner).group(0)]
        attrs = re.search(r'\s+(.*)', inner)
        if attrs:
            attdct = get_attr(attrs.group(0))
            if attdct:
                parts.append('.'.join(name + '=' + value
                                      for name, value in attdct.items()))

    # Text content up to the next '<'; the '<' itself stays in the block.
    text = None
    if not self_closing:
        found = re.match(r'([^<].*?)<', block)
        if found:
            block = block[found.end() - 1:]
            text = found.group(1)

    if parts is not None:
        if text is not None:
            parts.append('=' + text)
        taglst.append('.'.join(parts))
        print('/'.join(taglst))
        if self_closing:
            taglst.pop()
    elif text is not None:
        # Text without its own tag: report it below the current path
        # without pushing anything on the stack.
        print('/'.join(taglst + ['=' + text]))
    else:
        print('/'.join(taglst))

    # At most one closing tag per call.
    closing = re.match(r'\s*<\/.*?>', block)
    if closing:
        if taglst:
            taglst.pop()
        block = block[closing.end():]

    return (block, taglst)
    
def get_attr(attrs):
    """Extract ``name=value`` pairs from the attribute part of a tag.

    Quoted values are taken up to their closing quote, so values that
    contain spaces are no longer cut at the first blank. Names may contain
    ``:``, ``.`` and ``-`` (e.g. ``xml:lang``), and blanks around ``=`` are
    accepted. Surrounding quotes are kept as part of the value.

    Args:
        attrs: text following the tag name inside the angle brackets.

    Returns:
        Dict mapping attribute names to their values, in order of
        appearance.
    """
    attdct = {}
    pattern = r'''([\w:.-]+)\s*=\s*("[^"]*"|'[^']*'|\S+)'''
    for att in re.finditer(pattern, attrs):
        attdct[att.group(1)] = att.group(2)
    return attdct
    
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()