Difference between revisions of "Python:XML"
Jump to navigation
Jump to search
m (→Flatten XML) |
|||
Line 61: | Line 61: | ||
===Flatten XML=== | ===Flatten XML=== | ||
− | Below code converts any XML file into lines per leaf. | + | Below code converts any XML file into lines per leaf. It processes the files tag by tag and does not read the entire file into memory, therefor it can handle very large files regardless the memory available. |
+ | |||
Output format: | Output format: | ||
tag1 | tag1 | ||
Line 118: | Line 119: | ||
def doit(block,taglst): | def doit(block,taglst): | ||
#print('blockt '+block) | #print('blockt '+block) | ||
− | |||
# Remove garbage from the beginning | # Remove garbage from the beginning | ||
mo1 = re.match('\s*<\?.*?\?>\s*',block) | mo1 = re.match('\s*<\?.*?\?>\s*',block) |
Revision as of 23:21, 27 January 2018
- from xml.etree import ElementTree
- Load XML-support
- tree = get_tree(filename)
- Read the file into an XML-tree
- root = tree.getroot()
- Get the tree root element object
- for element in root
- Get the elements in root
- element.tag
- element.attrib
- element.text
- <tag attribname=attrib>text</tag>
Code example:
#!/usr/bin/env python3
def usage():
    """Print a short usage message for the XML dump script and terminate.

    Raises:
        SystemExit: always, with no exit status, after the message is printed.
    """
    print ("Print XML tags, attributes and text")
    print ("Usage: "+__file__+" <xmlfile>")
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module and is missing when Python is started with -S.
    raise SystemExit
import os
import sys
from xml.etree import ElementTree
def main():
    """Parse the XML file named on the command line and dump its elements.

    Expects exactly one command-line argument (the XML file). Any other
    argument count prints the usage text and exits via usage().
    """
    if len(sys.argv) != 2:
        usage()  # never returns
    filename = sys.argv[1]
    tree = get_tree(filename)
    root = tree.getroot()
    # Print every descendant of the root, then the root element object itself.
    get_all(root)
    print(root)
def get_all(element,prefix=""):
    """Recursively print tag, attributes and text of every descendant.

    Args:
        element: an ElementTree element whose children are printed.
        prefix: indentation accumulated so far; one tab is added per level.

    Each descendant is printed on its own line as
    ``<indent> <tag> <attrib dict> <text>`` (fields separated by print's
    default single space). The element passed in is not printed itself.
    """
    indent = prefix+"\t"
    for child in element:
        print (indent,child.tag,child.attrib,child.text)
        get_all(child,indent)
def get_tree(filename):
    """Parse *filename* into an ElementTree and return it.

    Args:
        filename: path of the XML file to read.

    Returns:
        The parsed ``ElementTree`` object.

    If *filename* is not an existing regular file, the usage text is printed
    and the script exits via usage().
    """
    if not os.path.isfile(filename):
        usage()  # never returns
    return ElementTree.parse(filename)
# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()
Flatten XML
The code below converts any XML file into one line per leaf. It processes the file tag by tag and does not read the entire file into memory; therefore it can handle very large files regardless of the available memory.
Output format:
tag1 tag1/tag2.attrib.=value tag1/tag2.attrib.=value/tag3 content
#!/usr/bin/env python3
def usage():
    """Print a short usage message for the XML flattening script and terminate.

    Raises:
        SystemExit: always, with no exit status, after the message is printed.
    """
    print ("Convert any XML to 1 line per leave including attributes and content")
    print ("Usage: "+__file__+" [<xmlfiles>]")
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module and is missing when Python is started with -S.
    raise SystemExit
import os
import sys
scriptname = os.path.abspath(__file__)
## end of standard part ##
import glob,re
# Number of characters readit() requests from the input file per read() call.
blockl = 65536
# readit() stops parsing its current buffer once it is this short or shorter
# and reads the next chunk first.
minblock = 256
def main():
    """Flatten every XML file matched by the command-line arguments.

    Each argument is treated as its own glob pattern. Every match that is a
    regular file is opened in text mode and streamed through readit().
    With no arguments nothing is processed.
    """
    # Expand each argument separately: joining them with a space produced a
    # single pattern that matched nothing as soon as two files were given.
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if not os.path.isfile(filename):
                continue
            # The context manager closes the handle even if parsing fails.
            with open (filename,"r") as f1:
                print('File is open')
                readit(f1)
def readit(f1):
    """Stream an open XML text file through doit() in fixed-size chunks.

    Args:
        f1: a file object opened for reading text.

    The file is read `blockl` characters at a time. Line breaks are removed
    and whitespace between adjacent tags is collapsed before the buffer is
    handed to doit(), which prints the flattened lines and returns the part
    of the buffer it did not consume. Returns None.
    """
    taglst = []       # stack of currently open elements, maintained by doit()
    blockrest = ''    # unparsed tail carried over to the next chunk
    newblock = f1.read(blockl)
    while newblock:
        block = re.sub(r'>\s*<','><',blockrest+newblock.replace('\n','').replace('\r',''))
        # Parse while more than `minblock` characters remain; the short rest
        # is kept back and prepended to the next chunk.
        while len(block) > minblock:
            shorter,taglst = doit(block,taglst)
            if len(shorter) == len(block):
                # doit() consumed nothing (e.g. an incomplete construct):
                # fetch more input instead of spinning forever.
                break
            block = shorter
        blockrest = block
        newblock = f1.read(blockl)
    # End of input: drain what is left (a single leftover character is
    # ignored, as before).
    block = blockrest
    while len(block) > 1:
        shorter,taglst = doit(block,taglst)
        if len(shorter) == len(block):
            # Unparseable remainder; stop rather than loop endlessly.
            break
        block = shorter
def doit(block,taglst):
    """Consume one step of XML from the start of *block* and print its path.

    Args:
        block: buffered XML text, without line breaks.
        taglst: stack of the currently open elements; modified in place.

    Returns:
        A ``(block, taglst)`` tuple: the unconsumed rest of the buffer and
        the same list object that was passed in.

    One call handles, in this order: a leading ``<?...?>`` instruction, one
    opening (or self-closing) tag with its attributes and directly following
    text, and one closing tag. When an opening tag is found, an entry
    ``tag[.attr=value...][.=text]`` is pushed on *taglst* and the whole stack
    is printed joined by ``/``.

    NOTE(review): the listing this was restored from had lost its
    indentation; content handling and the print are assumed to belong to the
    "opening tag found" branch - confirm against the original script.
    NOTE(review): ``<!-- -->`` comments and ``<!DOCTYPE>`` are still matched
    as ordinary opening tags, as before.
    """
    # Remove an XML declaration / processing instruction from the beginning
    declaration = re.match(r'\s*<\?.*?\?>\s*',block)
    if declaration:
        block = block[declaration.end():]

    # Find an opening tag; group 2 is the '/' of a self-closing tag
    selfclosing = False
    opening = re.match(r'\s*<\s*([^\/\?].*?)(\/)?>',block)
    if opening:
        selfclosing = bool(opening.group(2))
        block = block[opening.end():]
        inner = opening.group(1)
        # r'\S*' always matches, so the tag name is always available
        parts = [re.match(r'\S*',inner).group(0)]
        # Get the tag attributes
        attrs = re.search(r'\s+(.*)',inner)
        if attrs:
            attdct = get_attr(attrs.group(0))
            parts.append('.'.join(name+'='+attdct[name] for name in attdct))
        # Find text content directly after the tag; the '<' that ends it
        # stays in the buffer
        content = re.match(r'([^<].*?)<',block)
        if content:
            block = block[content.end()-1:]
            parts.append('='+content.group(1))
        taglst.append('.'.join(parts))
        print('/'.join(taglst))

    # A self-closing tag ends immediately
    if selfclosing:
        taglst.pop()

    # Find a closing tag. Only '</...>' counts here: also accepting '<name/>'
    # made a self-closing child pop its parent from the stack.
    closing = re.match(r'\s*<\/.*?>',block)
    if closing:
        if taglst:
            # A stray closing tag with nothing open is skipped, not a crash
            taglst.pop()
        block = block[closing.end():]

    if not opening and not closing:
        # Text that does not directly follow an opening tag (e.g. tail text
        # after a child element) cannot be attached to a path; skip it up to
        # the next '<' so the caller always makes progress.
        stray = re.match(r'[^<]+(?=<)',block)
        if stray:
            block = block[stray.end():]
    return (block,taglst)
def get_attr(attrs):
    """Extract ``name=value`` pairs from the attribute part of a tag.

    Args:
        attrs: the text following the tag name inside an opening tag.

    Returns:
        A dict mapping attribute name to its value exactly as written,
        i.e. including any surrounding quotes. If a name occurs more than
        once the last value wins.
    """
    # A quoted value may contain spaces, so match it up to its closing quote
    # instead of up to the next whitespace. Names may contain ':', '.' and
    # '-' (e.g. xml:lang, data-id).
    pattern = r'''([\w:.-]+)=("[^"]*"|'[^']*'|\S+)'''
    return {att.group(1): att.group(2) for att in re.finditer(pattern,attrs)}
# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()