Difference between revisions of "Python:XML"
Jump to navigation
Jump to search
Line 268: | Line 268: | ||
def get_node(block,tag): | def get_node(block,tag): | ||
− | taglist = re.findall('<\s*'+tag+'[\s\>].*?</'+tag,block,re.DOTALL) | + | taglist = re.findall('<\s*'+tag+'[\s\>].*?</'+tag+'\s*>',block,re.DOTALL) |
if len(taglist) == 0: | if len(taglist) == 0: | ||
taglist = re.findall('<\s*'+tag+'.*?/>',block,re.DOTALL) | taglist = re.findall('<\s*'+tag+'.*?/>',block,re.DOTALL) |
Revision as of 16:12, 15 December 2020
TODO:
- Check iterparse (xml.etree.ElementTree.iterparse) for parsing large files.
- Check lxml.etree as, according to the developers, it offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer.
- Convert XML to a dict
import xmltodict
dict1 = xmltodict.parse(xml-string)
- from xml.etree import ElementTree as ET
- Load XML-support
- tree = ET.parse(filename)
- Read the file or an xml-string into an XML-tree
- root = tree.getroot()
- root = ET.fromstring(xml-string)
- Get the tree root element object
- for element in root
- Get the elements in root
- element.tag
- element.attrib
- element.text
- <tag attribname=attrib>text</tag>
- element.find(elementname)
- Get first subelement by name (element can be root too)
Code example:
#!/usr/bin/env python3
def usage():
    """Print a short help text for this script and terminate.

    Raises:
        SystemExit: always, with the default (success) exit status.
    """
    print("Print XML tags, attributes and text")
    print("Usage: " + __file__ + " <xmlfile>")
    # exit() is an interactive helper injected by the site module and is not
    # guaranteed to exist (e.g. under "python -S"); raising SystemExit is
    # what it does underneath and needs no import.
    raise SystemExit
import os
import sys
from xml.etree import ElementTree as ET
def main():
    """Dump the XML file named by the single command-line argument.

    Shows the usage text (which terminates the script) unless exactly one
    argument was given, then prints every descendant of the root element
    via get_all() followed by the root element object itself.
    """
    if len(sys.argv) != 2:
        usage()
    xml_path = sys.argv[1]
    document_root = get_tree(xml_path).getroot()
    get_all(document_root)
    print(document_root)
def get_all(element, prefix=""):
    """Recursively print tag, attributes and text of every descendant.

    Args:
        element: iterable XML element whose children are printed.
        prefix: indentation accumulated so far; one tab is added per level.
    """
    child_prefix = prefix + "\t"
    for node in element:
        print(child_prefix, node.tag, node.attrib, node.text)
        # Descend with the deeper prefix so nested levels are indented further.
        get_all(node, child_prefix)
def get_tree(filename):
    """Parse *filename* into an ElementTree.

    Shows the usage text (which terminates the script) when *filename* is
    not an existing regular file.

    Returns:
        The tree produced by ET.parse().
    """
    if not os.path.isfile(filename):
        usage()
    parsed = ET.parse(filename)
    return parsed
# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()
Flatten XML
The code below converts any XML file into one line per leaf. It processes the file tag by tag and does not read the entire file into memory; therefore it can handle very large files regardless of the memory available. It is very slow when large files are processed (iterparse may be helpful here).
Output format:
tag1 tag1/tag2.attrib.=value tag1/tag2.attrib.=value/tag3 content
#!/usr/bin/env python3
def usage():
    """Print a short help text for this script and terminate.

    Raises:
        SystemExit: always, with the default (success) exit status.
    """
    print("Convert any XML to 1 line per leave including attributes and content")
    print("Usage: " + __file__ + " [<xmlfiles>]")
    # exit() is an interactive helper injected by the site module and is not
    # guaranteed to exist (e.g. under "python -S"); raising SystemExit is
    # what it does underneath and needs no import.
    raise SystemExit
import os
import sys
# Absolute path of this script file; no other use of `scriptname` is visible in this listing.
scriptname = os.path.abspath(__file__)
## end of standard part ##
import glob,re
# Number of characters requested from the input file per read() call in readit().
blockl = 65536
# readit() keeps calling doit() on its buffer only while the buffer is longer
# than this many characters; the shorter remainder is carried over and
# prepended to the next chunk read from the file.
minblock = 256
def main():
    """Flatten every XML file matched by the command-line arguments.

    Each argument is expanded as its own glob pattern.  Joining all
    arguments with spaces into a single pattern made any call with more
    than one argument match nothing.  Matches that are not regular files
    are skipped.
    """
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if not os.path.isfile(filename):
                continue
            # The context manager closes the file even if parsing fails.
            with open(filename, "r") as f1:
                print('File is open')
                readit(f1)
def readit(f1):
    """Feed the open text file *f1* to doit() in chunks of `blockl` characters.

    Line breaks are removed and whitespace between adjacent tags is
    collapsed before parsing.  While more input is available, a tail of at
    most `minblock` characters is held back and prepended to the next
    chunk; after the last chunk the remainder is parsed down to a single
    character.

    Args:
        f1: file object opened for reading text.
    """
    taglst = []
    blockrest = ''
    newblock = f1.read(blockl)
    while newblock:
        flattened = newblock.replace('\n', '').replace('\r', '')
        block = re.sub(r'>\s*<', '><', blockrest + flattened)
        while len(block) > minblock:
            remaining, taglst = doit(block, taglst)
            if len(remaining) == len(block):
                # doit() only ever cuts text off the front, so an unchanged
                # length means nothing was consumed.  Fetch more input
                # instead of spinning on the same buffer forever.
                break
            block = remaining
        blockrest = block
        newblock = f1.read(blockl)
    block = blockrest
    while len(block) > 1:
        remaining, taglst = doit(block, taglst)
        if len(remaining) == len(block):
            # Unparseable tail and no more input: stop rather than loop forever.
            break
        block = remaining
# doit(block, taglst): perform one parsing step at the front of `block`.
#
# In statement order the function:
#   1. strips a leading `<? ... ?>` section (plus surrounding whitespace);
#   2. tries to match a start tag; on a match the tag text is cut off the
#      buffer, the leading run of non-whitespace characters becomes the tag
#      name and the part from the first whitespace onward goes to get_attr();
#   3. tries to match text content up to (not including) the next '<';
#   4. appends the '.'-joined pieces to `taglst` and prints the '/'-joined list;
#   5. pops `taglst` once if the start tag ended in '/', and once more if the
#      buffer now starts with `</...>` or an attribute-less `<name/>`.
# Returns the tuple (unconsumed rest of block, taglst).
#
# NOTE(review): the indentation of this listing was lost, so which statements
# are nested under `if mo1:` cannot be read off the text — confirm against a
# runnable copy before changing the logic.
def doit(block,taglst):
#print('blockt '+block)
# Remove garbage from the beginning
mo1 = re.match('\s*<\?.*?\?>\s*',block)
if mo1:
block = block[mo1.end():]
# Find a tag
endtag = ''
# group(1): everything between '<' and the closing '>' (or '/>'); must not start with '/' or '?'
# group(2): the optional '/' right before '>'
mo1 = re.match('\s*<\s*([^\/\?].*?)(\/)?>',block)
if mo1:
#print('group0tag '+mo1.group(0))
# NOTE(review): group(2) is None when the optional '/' did not take part in
# the match; it does not raise, so this except clause looks unreachable — confirm.
try:
endtag = mo1.group(2)
except NameError:
pass
block = block[mo1.end():]
#print('mo1group ',mo1.group(1))
# Tag name: leading run of non-whitespace characters of group(1)
mo2 = re.match('\S*',mo1.group(1))
if mo2:
tag = mo2.group(0)
tagl1 = [tag]
# Get the tag attributes
mo2 = re.search('\s+(.*)',mo1.group(1))
if mo2:
attdct = get_attr(mo2.group(0))
attrline = ''
for attr in attdct:
attrline += '.'+attr+'='+attdct[attr]
# [1:] drops the leading '.' separator
tagl1.append(attrline[1:])
#print('blockc '+block)
# Find content
mo1 = re.match('([^<].*?)<',block)
if mo1:
#print('group0cont '+mo1.group(0))
# end()-1 keeps the '<' that terminated the content at the front of the buffer
block = block[mo1.end()-1:]
content = mo1.group(1)
tagl1.append('='+content)
# NOTE(review): the bare except silently ignores any failure of the append
# (e.g. `tagl1` not being defined) — presumably deliberate; confirm.
try:
taglst.append('.'.join(tagl1))
except:
pass
print('/'.join(taglst))
# Find end-tag
# `endtag` is truthy only when the start tag matched above ended in '/'
if endtag:
popped = taglst.pop()
#print('Popped '+popped)
#print('blocke '+block)
mo1 = re.match('\s*(<\/.*?>|<\w*?\/>)',block)
if mo1:
# `popped` is only referenced by the commented-out debug print
popped = taglst.pop()
#print('Popped '+mo1.group(0))
block = block[mo1.end():]
return (block,taglst)
def get_attr(attrs):
    """Parse the attribute part of a start tag into a dict.

    Values are returned exactly as written, including any surrounding
    quotes.  A quoted value is taken up to its closing quote, so values
    containing spaces are no longer cut at the first space.  Names may
    contain ':', '.' and '-' (e.g. ``xml:lang``).

    Args:
        attrs: text following the tag name, e.g. ' id="1" name="a b"'.

    Returns:
        dict mapping attribute name to its raw value text; empty when no
        name=value pair is found.
    """
    pair_re = r'''([\w:.-]+)\s*=\s*("[^"]*"|'[^']*'|\S+)'''
    return {m.group(1): m.group(2) for m in re.finditer(pair_re, attrs)}
# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()
Parse into dict
#!/usr/bin/env python2
def usage():
    """Print a short help text for this script and terminate.

    Raises:
        SystemExit: always, with the default (success) exit status.
    """
    print("Simple and pretty fast XML parser for all file sizes")
    print("Usage: " + __file__ + " <xmlfiles>")
    # exit() is an interactive helper injected by the site module and is not
    # guaranteed to exist (e.g. under "python -S"); raising SystemExit is
    # what it does underneath and needs no import.
    raise SystemExit
import os,sys,re,glob
# Glob pattern selecting the input files; main() passes it to read_it().
filespec = '*.xml'
# Number of characters read from an input file per read() call in read_it().
blockl = 100000 # Must be large enough to contain an instance of the highest node you are looking for.
def main():
    """Print every (TAG1 id, TAG2 id) pair collected by read_it().

    Output is a header line followed by one tab-separated line per pair.
    """
    result = read_it(filespec)
    print("Heading1\t->\tHeading2t")
    for tag1id, tag2ids in result.items():
        for tag2id in tag2ids:
            # get_attr() yields None for a node without an ID attribute;
            # format() prints that as "None", where string '+' raised TypeError.
            print("{0}\t->\t{1}".format(tag1id, tag2id))
def read_it(filespec):
    """Parse every regular file matching *filespec* and collect the results.

    Each file is read in chunks of `blockl` characters.  The text do_it()
    hands back as unprocessed is prepended to the next chunk, so a node
    split across a chunk boundary is seen in full on the next pass.

    Args:
        filespec: glob pattern of the files to read.

    Returns:
        The result dict as filled in by do_it().
    """
    result = {}
    for filename in glob.glob(filespec):
        if not os.path.isfile(filename):
            continue
        with open(filename, "r") as f1:
            blockrest = ''
            newblock = f1.read(blockl)
            # Remove garbage (a leading <? ... ?> section) from the
            # beginning of the file.
            garbage_mo = re.match(r'\s*<\?.*?\?>\s*', newblock)
            if garbage_mo:
                newblock = newblock[garbage_mo.end():]
            while newblock:
                # The original line carried a stray ')' here, which made
                # the whole script a syntax error.
                block = blockrest + newblock
                blockrest, result = do_it(block, result)
                newblock = f1.read(blockl)
    return result
def do_it(block, result):
    """Record the TAG2 ids found inside every complete TAG1 node of *block*.

    Args:
        block: XML text to scan.
        result: dict mapping a TAG1 id to the set of TAG2 ids seen below
            it; updated in place.

    Returns:
        Tuple (rest, result), where rest is the text following the last
        processed TAG1 node, to be prepended to the next chunk.
    """
    for tag1 in get_node(block, 'TAG1'):
        # Drop everything up to and including this node from the buffer.
        # The original referenced the undefined name `mecontext` here and
        # raised NameError on the first node.
        dataend = block.find(tag1) + len(tag1)
        block = block[dataend:]
        tag1id = get_attr(tag1, 'ID')
        tag2ids = result.setdefault(tag1id, set())
        for tag2 in get_node(tag1, 'TAG2'):
            tag2id = get_attr(tag2, 'ID')
            # Shows how to fetch an element's text; the value is not stored.
            content = get_content(tag2, 'TAG3')
            tag2ids.add(tag2id)
    return block, result
def get_node(block, tag):
    """Return the text of every *tag* element in *block*, in document order.

    Both paired (``<tag ...>...</tag>``) and self-closing (``<tag .../>``)
    elements are returned.  The tag name is matched literally and in full,
    so ``TAG1`` does not match ``<TAG10>``.  Nested elements of the same
    name are not supported: a paired match ends at the first closing tag.

    Args:
        block: XML text to search.
        tag: element name to look for.

    Returns:
        List of matched element strings; empty when nothing matches.
    """
    name = re.escape(tag)
    # Optional attribute section.  Quoted values are skipped as a unit so a
    # '>' inside a value does not end the tag early.
    attrs = r'''(?:\s(?:"[^"]*"|'[^']*'|[^<>"'])*)?'''
    # Self-closing form is tried first, so it cannot be mistaken for the
    # opening tag of a later paired element.
    self_closing = r'<\s*' + name + attrs + r'/>'
    paired = r'<\s*' + name + attrs + r'>.*?</' + name + r'\s*>'
    return re.findall(self_closing + '|' + paired, block, re.DOTALL)
def get_content(block, tag):
    """Return the text between the first ``<tag ...>`` and its ``</tag>``.

    The opening tag, including its attributes, is consumed in full.  The
    earlier pattern let the attributes and the closing '>' leak into the
    returned content.  Self-closing elements have no content and are
    skipped.

    Args:
        block: XML text to search.
        tag: element name, matched literally and in full.

    Returns:
        The content string (possibly empty), or None when no paired *tag*
        element is found.
    """
    name = re.escape(tag)
    # Quoted attribute values are skipped as a unit.  (?<!/) rejects '/>'
    # so a self-closing tag is never treated as an opening tag.
    opening = r'<\s*' + name + r'''(?:\s(?:"[^"]*"|'[^']*'|[^<>"'])*)?(?<!/)>'''
    content_mo = re.search(opening + r'(.*?)</' + name + r'\s*>', block, re.DOTALL)
    if content_mo is None:
        return None
    return content_mo.group(1)
def get_attr(block, attr):
    """Return the value of the first *attr* attribute found in *block*.

    The attribute name is matched literally and must be preceded by
    whitespace, so ``ID`` does not match ``XID``.  The value must be
    closed by the same quote character that opened it, so ``ID="it's"``
    yields ``it's``.

    Args:
        block: XML text to search (the whole text is searched, not only
            the first tag).
        attr: attribute name.

    Returns:
        The attribute value without its quotes, or None when not found.
    """
    pattern = r'\s' + re.escape(attr) + r'''\s*=\s*(["'])(.*?)\1'''
    attribute_mo = re.search(pattern, block, re.DOTALL)
    if attribute_mo is None:
        return None
    return attribute_mo.group(2)
# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()