Python:XML
- from xml.etree import ElementTree
- Load XML-support
- tree = get_tree(filename)
- Read the file into an XML tree (get_tree is the helper defined in the code example below; it calls ElementTree.parse(filename))
- root = tree.getroot()
- Get the tree root element object
- for element in root:
- Iterate over the direct child elements of root
- element.tag
- element.attrib
- element.text
- <tag attribname=attrib>text</tag>
Code example:
#!/usr/bin/env python3
def usage():
    """Print a short help text for this script and terminate the program.

    Never returns normally: ``sys.exit()`` raises ``SystemExit`` with the
    default (success) status, exactly as the previous ``exit()`` call did.
    """
    print("Print XML tags, attributes and text")
    print("Usage: " + __file__ + " <xmlfile>")
    # Use sys.exit() rather than the bare exit(): the latter is a convenience
    # object injected by the ``site`` module for interactive sessions and is
    # not available when the interpreter runs without it (e.g. ``python -S``).
    sys.exit()
import os
import sys
from xml.etree import ElementTree
def main():
    """Entry point: dump the XML file named by the single command-line argument.

    Exactly one argument is expected; with any other argument count the
    usage text is shown (usage() terminates the program).  The parsed
    tree's descendants are printed via get_all(), followed by the root
    element object itself.
    """
    argv = sys.argv
    if len(argv) != 2:
        usage()
    else:
        filename = argv[1]
    document = get_tree(filename)
    top = document.getroot()
    get_all(top)
    print(top)
def get_all(element, prefix=""):
    """Recursively print tag, attributes and text of every descendant.

    element -- the node whose children are listed (the node itself is not
               printed)
    prefix  -- indentation accumulated so far; one tab is added per level
    """
    indent = prefix + "\t"
    for node in element:
        print(indent, node.tag, node.attrib, node.text)
        # Descend with the deeper indent so nesting shows in the output.
        get_all(node, indent)
def get_tree(filename):
    """Parse *filename* and return the resulting ElementTree.

    When *filename* is not an existing regular file the usage text is
    shown instead (usage() terminates the program).
    """
    if not os.path.isfile(filename):
        usage()
    else:
        tree = ElementTree.parse(filename)
    return tree
# Run only when executed as a script, so that importing this module
# (e.g. to reuse get_tree or get_all) does not start parsing sys.argv.
if __name__ == "__main__":
    main()
Flatten XML
The code below converts any XML file into one line per leaf. It processes the file tag by tag and does not read the entire file into memory; therefore it can handle very large files regardless of the memory available.
Output format:
tag1 tag1/tag2.attrib.=value tag1/tag2.attrib.=value/tag3 content
#!/usr/bin/env python3
def usage():
    """Print a short help text for this script and terminate the program.

    Never returns normally: ``sys.exit()`` raises ``SystemExit`` with the
    default (success) status, exactly as the previous ``exit()`` call did.
    """
    print("Convert any XML to 1 line per leave including attributes and content")
    print("Usage: " + __file__ + " [<xmlfiles>]")
    # Use sys.exit() rather than the bare exit(): the latter is a convenience
    # object injected by the ``site`` module for interactive sessions and is
    # not available when the interpreter runs without it (e.g. ``python -S``).
    sys.exit()
import os
import sys
# Absolute path of this script file. Nothing in the code shown on this page
# reads this variable.
scriptname = os.path.abspath(__file__)
## end of standard part ##
import glob,re
# Number of characters requested from the input file per read() call in
# readit() (the file is opened in text mode).
blockl = 65536
# readit() keeps handing the buffer to doit() only while it holds more than
# this many characters; what is left is carried over and prepended to the
# next chunk read from the file.
minblock = 256
def main():
    """Flatten every XML file named on the command line.

    Each argument is expanded as a glob pattern on its own, so both
    ``script a.xml b.xml`` and ``script '*.xml'`` work.  Matches that are
    not regular files are skipped.  With no arguments nothing is done.
    """
    patterns = sys.argv[1:]
    files = []
    for pattern in patterns:
        # Expand per argument: joining all arguments with spaces first (as
        # was done before) yields one pattern such as "a.xml b.xml" that
        # matches nothing as soon as more than one file is given.
        files.extend(glob.glob(pattern))
    if not files and len(patterns) > 1:
        # Backward compatibility: a single path containing spaces that was
        # passed unquoted used to be found through the joined pattern.
        files = glob.glob(' '.join(patterns))
    for filename in files:
        if os.path.isfile(filename):
            # The context manager closes the file even if parsing fails;
            # previously the handle was never closed.
            with open(filename, "r") as f1:
                print('File is open')
                readit(f1)
    return
def readit(f1):
    """Stream an open text file through doit() chunk by chunk.

    f1 -- file object opened for reading in text mode

    The file is read ``blockl`` characters at a time.  Line breaks are
    removed and whitespace between adjacent tags is collapsed, then doit()
    is called repeatedly while more than ``minblock`` characters are
    buffered; the remainder is prepended to the next chunk.  After the last
    chunk the remaining buffer is drained.

    NOTE(review): output is only complete when every tag together with its
    text fits inside the carried-over remainder; longer items that straddle
    a read boundary may be reported in pieces -- confirm against real data.
    """
    taglst = []
    blockrest = ''
    newblock = f1.read(blockl)
    while newblock:
        flattened = newblock.replace('\n', '').replace('\r', '')
        block = re.sub(r'>\s*<', '><', blockrest + flattened)
        while len(block) > minblock:
            before = len(block)
            block, taglst = doit(block, taglst)
            if len(block) == before:
                # doit() could not consume anything (e.g. text whose
                # terminating '<' has not been read yet).  Fetch more input
                # instead of spinning forever on the same buffer.
                break
        blockrest = block
        newblock = f1.read(blockl)
    block = blockrest
    while len(block) > 1:
        before = len(block)
        block, taglst = doit(block, taglst)
        if len(block) == before:
            # Unparseable trailing data (such as spaces after the root end
            # tag): stop rather than loop endlessly.
            break
    return
def doit(block,taglst):
    """Consume one parsing step from the front of *block* and print a path line.

    block  -- buffered XML text (line breaks already removed by the caller)
    taglst -- stack of path entries for the currently open elements; it is
              modified in place and also returned

    One call performs, in order:
      1. skip one leading ``<? ... ?>`` declaration / processing instruction;
      2. read one opening or self-closing tag, building the path entry
         ``tag[.attr=value...][.=text]`` and pushing it on *taglst*;
      3. read text up to the next ``<``;
      4. print the current path, entries joined by ``/``;
      5. pop the entry again for a self-closing tag;
      6. consume one closing tag ``</...>`` and pop its entry.

    A call that starts at a closing tag opens nothing and prints the
    current path once more before popping (unchanged behaviour).  Text that
    follows a closing tag is printed as ``<current path>.=text`` without
    touching the stack.

    Limitations: comments, DOCTYPE and CDATA sections are not recognised
    specially; the tag pattern accepts any name that does not start with
    ``/`` or ``?``.

    Returns the tuple ``(remaining block, taglst)``.
    """
    # Remove an XML declaration / processing instruction from the beginning
    mo1 = re.match(r'\s*<\?.*?\?>\s*', block)
    if mo1:
        block = block[mo1.end():]

    # Find a tag
    tagl1 = None       # parts of the new path entry; None when no tag opens here
    endtag = False     # True for a self-closing tag (<tag ... />)
    mo1 = re.match(r'\s*<\s*([^\/\?].*?)(\/)?>', block)
    if mo1:
        endtag = mo1.group(2) is not None
        block = block[mo1.end():]
        inner = mo1.group(1)
        tagl1 = [re.match(r'\S*', inner).group(0)]
        # Get the tag attributes
        mo2 = re.search(r'\s+(.*)', inner)
        if mo2:
            attdct = get_attr(mo2.group(0))
            tagl1.append('.'.join(attr + '=' + attdct[attr] for attr in attdct))

    # Find content (text up to, but not including, the next '<')
    tail = ''
    mo1 = re.match(r'([^<].*?)<', block)
    if mo1:
        block = block[mo1.end()-1:]
        content = mo1.group(1)
        if tagl1 is not None:
            tagl1.append('=' + content)
        else:
            # Text after a closing tag belongs to the element that is still
            # open.  This used to raise UnboundLocalError because no entry
            # was being built in this call.
            tail = '.=' + content

    if tagl1 is not None:
        taglst.append('.'.join(tagl1))
    print('/'.join(taglst) + tail)

    # A self-closing tag ends immediately
    if endtag:
        taglst.pop()

    # Find end-tag.  Only real closing tags count here: a following
    # self-closing sibling (<x/>) must be handled as a tag by the next call
    # instead of popping its parent.
    mo1 = re.match(r'\s*<\/.*?>', block)
    if mo1:
        if taglst:
            # Guard against a stray closing tag on an empty stack.
            taglst.pop()
        block = block[mo1.end():]
    return (block,taglst)
def get_attr(attrs):
    """Extract ``name=value`` pairs from the attribute part of a tag.

    attrs -- the text following the tag name, e.g. `` id="1" lang='en'``

    Returns a dict mapping attribute name to its value exactly as written,
    i.e. including any surrounding quotes.  Insertion order follows the
    order of appearance.

    Quoted values may contain whitespace (they were previously cut at the
    first space), and names may contain ``:``, ``-`` and ``.`` so that
    e.g. ``xml:lang`` or ``data-id`` keep their full name.
    """
    pattern = r'''([\w:.-]+)\s*=\s*("[^"]*"|'[^']*'|\S+)'''
    return {att.group(1): att.group(2) for att in re.finditer(pattern, attrs)}
# Run only when executed as a script, so that importing this module
# (e.g. to reuse readit or doit) does not start processing sys.argv.
if __name__ == "__main__":
    main()