Difference between revisions of "Python:XML"

From wiki
Jump to navigation Jump to search
 
(10 intermediate revisions by the same user not shown)
Line 1: Line 1:
;from xml.etree import ElementTree
+
[[Category:Python]]
 +
TODO:
 +
* Check [http://effbot.org/zone/element-iterparse.htm iterparse] for parsing large files.
 +
* Check lxml.etree as, [https://lxml.de/1.3/compatibility.html according to the developers], it offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer.
 +
 
 +
[[XML|Generic page on XML]]
 +
 
 +
;Convert XML to a dict
 +
<syntaxhighlight lang=python>
 +
import xmltodict
 +
dict1 = xmltodict.parse(xml-string)
 +
</syntaxhighlight>
 +
 
 +
;from xml.etree import ElementTree as ET
 
:Load XML-support
 
:Load XML-support
  
;tree = get_tree(filename)
+
;tree = ET.parse(filename)
:Read the file into an XML-tree
+
:Read the file or an xml-string into an XML-tree
  
 
;root = tree.getroot()
 
;root = tree.getroot()
 +
;root = ET.fromstring(xml-string)
 
:Get the tree root element object
 
:Get the tree root element object
  
Line 15: Line 29:
 
;element.text
 
;element.text
 
:<tag attribname=attrib>text</tag>
 
:<tag attribname=attrib>text</tag>
 +
 +
;element.find(elementname)
 +
:Get first subelement by name (element can be root too)
  
 
Code example:
 
Code example:
Line 28: Line 45:
 
import os
 
import os
 
import sys
 
import sys
from xml.etree import ElementTree
+
from xml.etree import ElementTree as ET
  
 
def main():
 
def main():
Line 51: Line 68:
 
     if os.path.isfile(filename):
 
     if os.path.isfile(filename):
 
         #with open(filename, 'r') as f:
 
         #with open(filename, 'r') as f:
         #  tree = ElementTree.parse(f)
+
         #  tree = ET.parse(f)
       tree = ElementTree.parse(filename)  
+
       tree = ET.parse(filename)  
 
     else:
 
     else:
 
         usage()  
 
         usage()  
Line 223: Line 240:
 
                     newblock = newblock[garbage_mo.end():]
 
                     newblock = newblock[garbage_mo.end():]
 
                 while len(newblock) > 0:
 
                 while len(newblock) > 0:
                    #Remove empty spaces and line endings (for formatted files)
+
                     block = blockrest+newblock)
                     block = re.sub('>\s*[\n\r]+\s*<','><',blockrest+newblock)
 
 
                     blockrest,result = do_it(block,result)
 
                     blockrest,result = do_it(block,result)
 
                     newblock = f1.read(blockl)
 
                     newblock = f1.read(blockl)
Line 233: Line 249:
 
     tag1list = get_node(block,'TAG1')
 
     tag1list = get_node(block,'TAG1')
 
     for tag1 in tag1list:
 
     for tag1 in tag1list:
         block = block.replace(tag1,'',1)
+
         #Remove processed part of block
 +
        #block = block.replace(tag1,'',1)
 +
        #or everything from the beginning
 +
        dataend = block.find(mecontext)+len(mecontext)
 +
        block = block[dataend:]
 +
 
 
         tag1id = get_attr(tag1,'ID')
 
         tag1id = get_attr(tag1,'ID')
         try:
+
         if tag1id in result:
             result[tag1id]
+
             pass
         except KeyError:
+
         else:
 
             result[tag1id] = set()
 
             result[tag1id] = set()
 
         tag2list = get_node(tag1,'TAG2')
 
         tag2list = get_node(tag1,'TAG2')
Line 248: Line 269:
  
 
def get_node(block,tag):
 
def get_node(block,tag):
     taglist = re.findall('<\s*'+tag+'\s+.*?</'+tag+'\s*>',block)
+
     taglist = re.findall('<\s*'+tag+'[\s\>].*?</'+tag+'\s*>',block,re.DOTALL)
 
     if len(taglist) == 0:
 
     if len(taglist) == 0:
         taglist = re.findall('<\s*'+tag+'.*?/>',block)
+
         taglist = re.findall('<\s*'+tag+'.*?/>',block,re.DOTALL)
 
     return taglist
 
     return taglist
  
  
 
def get_content(block,tag):
 
def get_content(block,tag):
     content_mo = re.search('<\s*'+tag+'.*?>(.*?)</'+tag+'\s*>',block)
+
    content = None
 +
     content_mo = re.search('<\s*'+tag+'[\s\>].*?>?(.*?)</'+tag+'\s*>',block,re.DOTALL)
 
     if content_mo:
 
     if content_mo:
 
         content = content_mo.group(1)
 
         content = content_mo.group(1)
Line 262: Line 284:
  
 
def get_attr(block,attr):
 
def get_attr(block,attr):
     attribute_mo = re.search(r' '+attr+'\s*=\s*[\"\'](.*?)[\"\']',block)
+
    attrvalue = None
 +
     attribute_mo = re.search(r' '+attr+'\s*=\s*[\"\'](.*?)[\"\']',block,re.DOTALL)
 
     if attribute_mo:
 
     if attribute_mo:
 
         attrvalue = attribute_mo.group(1)
 
         attrvalue = attribute_mo.group(1)

Latest revision as of 16:42, 11 August 2022

TODO:

  • Check iterparse for parsing large files.
  • Check lxml.etree as, according to the developers, it offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer.

Generic page on XML

Convert XML to a dict
import xmltodict
# xml_string is a str holding the XML document to convert.
dict1 = xmltodict.parse(xml_string)
from xml.etree import ElementTree as ET
Load XML-support
tree = ET.parse(filename)
Read the file or an xml-string into an XML-tree
root = tree.getroot()
root = ET.fromstring(xml_string)
Get the tree root element object
for element in root
Get the elements in root
element.tag
element.attrib
element.text
<tag attribname=attrib>text</tag>
element.find(elementname)
Get first subelement by name (element can be root too)

Code example:

#!/usr/bin/env python3

def usage():
    """Print a short help text for this script and terminate the program.

    Raises:
        SystemExit: always; the function never returns to its caller.
    """
    # Function-scope import so the helper does not depend on where the
    # module-level imports sit relative to this definition.
    import sys

    print("Print XML tags, attributes and text")
    print("Usage: " + __file__ + " <xmlfile>")
    # sys.exit() is the documented way to end a program; the bare exit()
    # helper is injected by the site module for interactive use and is
    # missing when Python is started with -S.
    sys.exit()

import os
import sys
from xml.etree import ElementTree as ET

def main():
    """Dump every element of the XML file named on the command line.

    Exactly one command-line argument (the XML file) is expected; with any
    other argument count usage() is called, which ends the program.
    """
    if len(sys.argv) != 2:
        usage()
    xml_path = sys.argv[1]
    document_root = get_tree(xml_path).getroot()
    get_all(document_root)
    print(document_root)
    
def get_all(element, prefix=""):
    """Recursively print tag, attributes and text of all descendants of *element*.

    Parameters:
        element: an iterable element whose children expose .tag, .attrib
                 and .text.
        prefix:  string printed in front of each line; one tab is appended
                 for every nesting level.

    Returns:
        None. The output is written with print().
    """
    indent = prefix + "\t"
    for node in element:
        print(indent, node.tag, node.attrib, node.text)
        # Descend with the extended indent so grandchildren are shifted
        # one more tab to the right.
        get_all(node, indent)
            
def get_tree(filename):
    """Parse the XML file *filename* and return the resulting ElementTree.

    When *filename* is not an existing regular file, usage() is called
    instead, which prints the help text and ends the program.
    """
    if not os.path.isfile(filename):
        usage()
    return ET.parse(filename)
    
# Start the script: main() runs as soon as this file is loaded.
main()

Flatten XML

The code below converts any XML file into one line per leaf. It processes the file tag by tag and does not read the entire file into memory; therefore it can handle very large files regardless of the available memory. It is very slow when large files are processed (iterparse may be helpful here).

Output format:

tag1
tag1/tag2.attrib.=value
tag1/tag2.attrib.=value/tag3 content
#!/usr/bin/env python3

def usage():
    """Print a short help text for this script and terminate the program.

    Raises:
        SystemExit: always; the function never returns to its caller.
    """
    # Function-scope import so the helper does not depend on where the
    # module-level imports sit relative to this definition.
    import sys

    print("Convert any XML to 1 line per leave including attributes and content")
    print("Usage: " + __file__ + " [<xmlfiles>]")
    # sys.exit() is the documented way to end a program; the bare exit()
    # helper is injected by the site module for interactive use and is
    # missing when Python is started with -S.
    sys.exit()

import os
import sys

# Absolute path of this script file.
scriptname = os.path.abspath(__file__)

## end of standard part ##

import glob,re

# Number of characters readit() requests from the input file per read() call.
blockl = 65536
# readit() keeps handing the buffered text to doit() while it is longer than
# this many characters; the shorter remainder is carried over and prepended
# to the next chunk that is read.
minblock = 256

def main():
    """Flatten every XML file selected by the command-line arguments.

    Each argument is expanded as its own glob pattern. (Joining all
    arguments into a single space-separated pattern, as was done before,
    matches nothing as soon as more than one argument is given.) Every
    match that is a regular file is opened in text mode and handed to
    readit(); the file is closed again when readit() returns or raises.
    """
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if not os.path.isfile(filename):
                continue
            with open(filename, "r") as f1:
                print('File is open')
                readit(f1)

def readit(f1):
    """Stream the open text file *f1* through doit() in chunks.

    The file is read blockl characters at a time. Line breaks are removed
    and whitespace between adjacent tags is collapsed before the text is
    handed to doit(), which consumes it from the front. Text that doit()
    has not consumed yet is carried over and prepended to the next chunk.

    Parameters:
        f1: file object opened for reading in text mode.

    Returns:
        None. All output is produced by doit().
    """
    taglst = []
    blockrest = ''
    newblock = f1.read(blockl)
    while newblock:
        chunk = newblock.replace('\n', '').replace('\r', '')
        block = re.sub(r'>\s*<', '><', blockrest + chunk)
        while len(block) > minblock:
            before = len(block)
            block, taglst = doit(block, taglst)
            if len(block) == before:
                # doit() consumed nothing (e.g. a tag is cut off at the end
                # of the buffer): stop and read more data instead of
                # spinning on the same text forever.
                break
        blockrest = block
        newblock = f1.read(blockl)
    # End of file: drain what is left in the buffer.
    block = blockrest
    while len(block) > 1:
        before = len(block)
        block, taglst = doit(block, taglst)
        if len(block) == before:
            # Unparseable remainder; without this guard the loop would
            # never terminate.
            break

def doit(block,taglst):
    """Consume at most one tag (plus following text) from the front of *block*.

    The path built from *taglst* is printed once per call.

    Parameters:
        block:  text that still has to be processed.
        taglst: list with one entry per currently open element; it is
                modified in place.

    Returns:
        Tuple (block, taglst): the unconsumed remainder of *block* and the
        updated list.
    """
    #print('blockt '+block)
    # Remove garbage from the beginning
    # (a leading <?...?> construct such as the XML declaration)
    mo1 = re.match('\s*<\?.*?\?>\s*',block)
    if mo1:
        block = block[mo1.end():]
    # Find a tag
    endtag = ''
    # group(1): tag name plus attribute text; group(2): the '/' of a
    # self-closing tag, if present. Closing tags and <?...?> are excluded by
    # the [^\/\?] character class.
    mo1 = re.match('\s*<\s*([^\/\?].*?)(\/)?>',block)
    if mo1:
        #print('group0tag '+mo1.group(0))
        # NOTE(review): group(2) yields None when the optional group did not
        # take part in the match; it does not raise, so this handler looks
        # unreachable - confirm.
        try:
            endtag = mo1.group(2)
        except NameError:
            pass
        block = block[mo1.end():]
        #print('mo1group ',mo1.group(1))
        # The tag name is the leading run of non-whitespace characters.
        mo2 = re.match('\S*',mo1.group(1))
        if mo2:
            tag = mo2.group(0)
            tagl1 = [tag]
        # Get the tag attributes
        mo2 = re.search('\s+(.*)',mo1.group(1))
        if mo2:
            attdct = get_attr(mo2.group(0))
            attrline = '' 
            for attr in attdct:
                attrline += '.'+attr+'='+attdct[attr]
            # [1:] drops the leading '.' of the first attribute
            tagl1.append(attrline[1:])
    #print('blockc '+block)
    # Find content
    # (text up to the next '<'; end()-1 leaves that '<' in block)
    mo1 = re.match('([^<].*?)<',block)
    if mo1:
        #print('group0cont '+mo1.group(0))
        block = block[mo1.end()-1:]
        content = mo1.group(1)
        # NOTE(review): tagl1 is only assigned when an opening tag was
        # matched above in this call; text without a preceding tag would
        # raise here, outside the try below - confirm whether that can occur.
        tagl1.append('='+content)
        
    # NOTE(review): when no opening tag was matched in this call, tagl1 is
    # unbound and the bare except silently skips the append; the calls that
    # only consume a closing tag appear to rely on this.
    try:
        taglst.append('.'.join(tagl1))
    except:
        pass
    print('/'.join(taglst))
    # Find end-tag
    # A self-closing tag is removed from the path right after printing.
    if endtag:
        popped = taglst.pop()
        #print('Popped '+popped)
    #print('blocke '+block)
    # A directly following closing tag (or attribute-less self-closing tag)
    # also removes the last path entry and is consumed from block.
    mo1 = re.match('\s*(<\/.*?>|<\w*?\/>)',block)
    if mo1:
        popped = taglst.pop()
        #print('Popped '+mo1.group(0))
        block = block[mo1.end():]
 
    return (block,taglst)
    
def get_attr(attrs):
    """Parse the attribute part of a tag into a dict.

    Parameters:
        attrs: the text that follows the tag name, e.g. ' id="a b" n=3'.

    Returns:
        dict mapping each attribute name to its value exactly as written,
        i.e. including the surrounding quotes when there are any.
    """
    # A value is either a complete quoted string, which may contain
    # whitespace and '=' characters, or otherwise a run of non-whitespace
    # characters. Matching only non-whitespace would cut a quoted value at
    # its first space and could mistake the rest for further attributes.
    pattern = r'''(\w+)=("[^"]*"|'[^']*'|\S+)'''
    return {mo.group(1): mo.group(2) for mo in re.finditer(pattern, attrs)}
    
# Start the script: main() runs as soon as this file is loaded.
main()

Parse into dict

#!/usr/bin/env python2

def usage():
    """Print a short help text for this script and terminate the program.

    Raises:
        SystemExit: always; the function never returns to its caller.
    """
    # Function-scope import so the helper does not depend on where the
    # module-level imports sit relative to this definition.
    import sys

    print("Simple and pretty fast XML parser for all file sizes")
    print("Usage: " + __file__ + " <xmlfiles>")
    # sys.exit() is the documented way to end a program; the bare exit()
    # helper is injected by the site module for interactive use and is
    # missing when Python is started with -S.
    sys.exit()
import os,sys,re,glob


# Glob pattern selecting the files to process; main() passes it to read_it().
filespec = '*.xml'
# Number of characters read from a file per read() call.
blockl = 100000  # Must be large enough to contain an instance of the highest node you are looking for.


def main():
    """Print the mapping returned by read_it(filespec) as a two-column table.

    A header line is printed first, followed by one line for every member
    of every value in the mapping, formatted as key, arrow, member.
    """
    relations = read_it(filespec)
    print("Heading1\t->\tHeading2t")
    for parent_id, child_ids in relations.items():
        for child_id in child_ids:
            print(parent_id+"\t->\t"+child_id)


def read_it(filespec):
    """Process all files matching *filespec* and return the collected result.

    Every matching regular file is read blockl characters at a time. Each
    chunk, prefixed with the text do_it() left over from the previous
    chunk, is passed to do_it(), which updates the result dict.

    Parameters:
        filespec: glob pattern selecting the input files.

    Returns:
        dict as filled in by do_it(); empty when no file matches.
    """
    result = {}
    for filename in glob.glob(filespec):
        if not os.path.isfile(filename):
            continue
        with open(filename, "r") as f1:
            blockrest = ''
            newblock = f1.read(blockl)
            # Remove garbage (a leading <?...?> construct) from the
            # beginning of the file
            garbage_mo = re.match(r'\s*<\?.*?\?>\s*', newblock)
            if garbage_mo:
                newblock = newblock[garbage_mo.end():]
            while len(newblock) > 0:
                # The previous version had a stray ')' on this line, which
                # made the whole script fail with a SyntaxError.
                block = blockrest + newblock
                blockrest, result = do_it(block, result)
                newblock = f1.read(blockl)
    return result


def do_it(block,result):
    """Record the TAG2 ids of every complete TAG1 element found in *block*.

    Parameters:
        block:  buffered XML text.
        result: dict mapping the 'ID' attribute of a TAG1 element to the set
                of 'ID' attributes of the TAG2 elements inside it; it is
                updated in place.

    Returns:
        Tuple (block, result): the text that follows the last processed
        TAG1 element (the whole input when none was found), and the
        updated dict.
    """
    for tag1 in get_node(block, 'TAG1'):
        # Remove everything up to and including this element, so that only
        # unprocessed text is carried over to the next chunk. (The previous
        # code referenced an undefined name here and raised NameError.)
        dataend = block.find(tag1) + len(tag1)
        block = block[dataend:]

        tag1id = get_attr(tag1, 'ID')
        tag2ids = result.setdefault(tag1id, set())
        for tag2 in get_node(tag1, 'TAG2'):
            tag2id = get_attr(tag2, 'ID')
            # The TAG3 text is looked up here but is not stored in the
            # result.
            content = get_content(tag2, 'TAG3')
            tag2ids.add(tag2id)
    return block, result


def get_node(block,tag):
    """Return the source text of every *tag* element found in *block*.

    Elements with a separate closing tag are searched first. Only when none
    is found are self-closing elements (<tag .../>) returned instead.

    Parameters:
        block: XML text to search; elements may span several lines.
        tag:   element name, taken literally.

    Returns:
        list of str, one entry per element, in document order; empty when
        nothing matches.
    """
    name = re.escape(tag)
    taglist = re.findall(r'<\s*' + name + r'[\s>].*?</' + name + r'\s*>',
                         block, re.DOTALL)
    if not taglist:
        # The name must be followed by whitespace or '/>' so that a longer
        # name starting with *tag* is not matched, and [^<] keeps the match
        # inside one tag instead of running on to the next '/>' in the text.
        taglist = re.findall(r'<\s*' + name + r'(?:\s[^<]*?)?/>',
                             block, re.DOTALL)
    return taglist


def get_content(block,tag):
    """Return the text between the first <tag ...> and its closing tag.

    Parameters:
        block: XML text to search; the content may span several lines.
        tag:   element name, taken literally.

    Returns:
        The text between the opening and the closing tag, or None when
        *block* contains no such element.
    """
    name = re.escape(tag)
    # The opening tag is matched up to its '>' so that attributes are not
    # captured as part of the content.
    content_mo = re.search(
        r'<\s*' + name + r'(?:\s[^>]*)?>(.*?)</' + name + r'\s*>',
        block, re.DOTALL)
    return content_mo.group(1) if content_mo else None


def get_attr(block,attr):
    """Return the value of the first attribute named *attr* in *block*.

    Parameters:
        block: XML text, normally starting with the tag to inspect.
        attr:  attribute name, taken literally.

    Returns:
        The attribute value without its quotes, or None when the attribute
        is not present.
    """
    # Any whitespace (not only a space) may precede the name, because tags
    # can be spread over several lines. The backreference makes the closing
    # quote the same character as the opening one, so a value such as
    # "it's" is not cut off at the apostrophe.
    attribute_mo = re.search(
        r'\s' + re.escape(attr) + r'''\s*=\s*(["'])(.*?)\1''',
        block, re.DOTALL)
    return attribute_mo.group(2) if attribute_mo else None


# Start the script: main() runs as soon as this file is loaded.
main()