Difference between revisions of "Python:XML"
Jump to navigation
Jump to search
(Created page with ";from xml.etree import ElementTree :Load XML-support ;tree = get_tree(filename) :Read the file into an XML-tree ;root = tree.getroot() :Get the tree root element object ;fo...") |
|||
(16 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
− | ;from xml.etree import ElementTree | + | [[Category:Python]] |
+ | TODO: | ||
+ | * Check [http://effbot.org/zone/element-iterparse.htm iterparse] for parsing large files. | ||
+ | * Check lxml.etree as, [https://lxml.de/1.3/compatibility.html according to the developers], it offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer. | ||
+ | |||
+ | [[XML|Generic page on XML]] | ||
+ | |||
+ | ;Convert XML to a dict | ||
+ | <syntaxhighlight lang=python> | ||
+ | import xmltodict | ||
+ | dict1 = xmltodict.parse(xml-string) | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ;from xml.etree import ElementTree as ET | ||
:Load XML-support | :Load XML-support | ||
− | ;tree = | + | ;tree = ET.parse(filename) |
− | :Read the file into an XML-tree | + | :Read the file or an XML string into an XML-tree | ||
;root = tree.getroot() | ;root = tree.getroot() | ||
+ | ;root = ET.fromstring(xml-string) | ||
:Get the tree root element object | :Get the tree root element object | ||
Line 16: | Line 30: | ||
:<tag attribname=attrib>text</tag> | :<tag attribname=attrib>text</tag> | ||
+ | ;element.find(elementname) | ||
+ | :Get first subelement by name (element can be root too) | ||
+ | Code example: | ||
<syntaxhighlight lang=python> | <syntaxhighlight lang=python> | ||
#!/usr/bin/env python3 | #!/usr/bin/env python3 | ||
Line 28: | Line 45: | ||
import os | import os | ||
import sys | import sys | ||
− | from xml.etree import ElementTree | + | from xml.etree import ElementTree as ET |
def main(): | def main(): | ||
Line 51: | Line 68: | ||
if os.path.isfile(filename): | if os.path.isfile(filename): | ||
#with open(filename, 'r') as f: | #with open(filename, 'r') as f: | ||
− | # tree = | + | # tree = ET.parse(f) |
− | tree = | + | tree = ET.parse(filename) |
else: | else: | ||
usage() | usage() | ||
return(tree) | return(tree) | ||
+ | main() | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ===Flatten XML=== | ||
+ | The code below converts any XML file into one line per leaf. It processes the file tag by tag and does not read the entire file into memory; therefore it can handle very large files regardless of the memory available. It is very slow when large files are processed (iterparse may be helpful here). | ||
+ | |||
+ | Output format: | ||
+ | tag1 | ||
+ | tag1/tag2.attrib.=value | ||
+ | tag1/tag2.attrib.=value/tag3 content | ||
+ | |||
+ | <syntaxhighlight lang=python> | ||
+ | #!/usr/bin/env python3 | ||
+ | |||
+ | def usage(): | ||
+ | print ("Convert any XML to 1 line per leave including attributes and content") | ||
+ | print ("Usage: "+__file__+" [<xmlfiles>]") | ||
+ | exit() | ||
+ | return | ||
+ | |||
+ | import os | ||
+ | import sys | ||
+ | |||
+ | scriptname = os.path.abspath(__file__) | ||
+ | |||
+ | ## end of standard part ## | ||
+ | |||
+ | import glob,re | ||
+ | |||
+ | blockl = 65536 | ||
+ | minblock = 256 | ||
+ | |||
+ | def main(): | ||
+ | files = glob.glob(' '.join(sys.argv[1:])) | ||
+ | for filename in files: | ||
+ | if os.path.isfile(filename): | ||
+ | f1 = open (filename,"r") | ||
+ | if f1: | ||
+ | print('File is open') | ||
+ | readit(f1) | ||
+ | return | ||
+ | |||
+ | def readit(f1): | ||
+ | eof = 0 | ||
+ | taglst = [] | ||
+ | blockrest = '' | ||
+ | newblock = f1.read(blockl) | ||
+ | while len(newblock) > 0: | ||
+ | block = re.sub('>\s*<','><',blockrest+newblock.replace('\n','').replace('\r','')) | ||
+ | while len(block) > minblock: | ||
+ | block,taglst = doit(block,taglst) | ||
+ | blockrest = block | ||
+ | newblock = f1.read(blockl) | ||
+ | #print('newblocklengte = '+str(len(newblock))) | ||
+ | block = blockrest | ||
+ | while len(block) > 1: | ||
+ | block,taglst = doit(block,taglst) | ||
+ | #print('Blocklengte = '+str(len(block))) | ||
+ | return | ||
+ | |||
+ | def doit(block,taglst): | ||
+ | #print('blockt '+block) | ||
+ | # Remove garbage from the beginning | ||
+ | mo1 = re.match('\s*<\?.*?\?>\s*',block) | ||
+ | if mo1: | ||
+ | block = block[mo1.end():] | ||
+ | # Find a tag | ||
+ | endtag = '' | ||
+ | mo1 = re.match('\s*<\s*([^\/\?].*?)(\/)?>',block) | ||
+ | if mo1: | ||
+ | #print('group0tag '+mo1.group(0)) | ||
+ | try: | ||
+ | endtag = mo1.group(2) | ||
+ | except NameError: | ||
+ | pass | ||
+ | block = block[mo1.end():] | ||
+ | #print('mo1group ',mo1.group(1)) | ||
+ | mo2 = re.match('\S*',mo1.group(1)) | ||
+ | if mo2: | ||
+ | tag = mo2.group(0) | ||
+ | tagl1 = [tag] | ||
+ | # Get the tag attributes | ||
+ | mo2 = re.search('\s+(.*)',mo1.group(1)) | ||
+ | if mo2: | ||
+ | attdct = get_attr(mo2.group(0)) | ||
+ | attrline = '' | ||
+ | for attr in attdct: | ||
+ | attrline += '.'+attr+'='+attdct[attr] | ||
+ | tagl1.append(attrline[1:]) | ||
+ | #print('blockc '+block) | ||
+ | # Find content | ||
+ | mo1 = re.match('([^<].*?)<',block) | ||
+ | if mo1: | ||
+ | #print('group0cont '+mo1.group(0)) | ||
+ | block = block[mo1.end()-1:] | ||
+ | content = mo1.group(1) | ||
+ | tagl1.append('='+content) | ||
+ | |||
+ | try: | ||
+ | taglst.append('.'.join(tagl1)) | ||
+ | except: | ||
+ | pass | ||
+ | print('/'.join(taglst)) | ||
+ | # Find end-tag | ||
+ | if endtag: | ||
+ | popped = taglst.pop() | ||
+ | #print('Popped '+popped) | ||
+ | #print('blocke '+block) | ||
+ | mo1 = re.match('\s*(<\/.*?>|<\w*?\/>)',block) | ||
+ | if mo1: | ||
+ | popped = taglst.pop() | ||
+ | #print('Popped '+mo1.group(0)) | ||
+ | block = block[mo1.end():] | ||
+ | |||
+ | return (block,taglst) | ||
+ | |||
+ | def get_attr(attrs): | ||
+ | attdct = {} | ||
+ | mo2 = re.finditer('(\w+)=(\S+)',attrs) | ||
+ | if mo2: | ||
+ | for att in mo2: | ||
+ | attdct[att.group(1)] = att.group(2) | ||
+ | return(attdct) | ||
+ | |||
+ | main() | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ===Parse into [[Python:DataTypes#dict|dict]]=== | ||
+ | |||
+ | <syntaxhighlight lang=python> | ||
+ | #!/usr/bin/env python2 | ||
+ | |||
+ | def usage(): | ||
+ | print ("Simple and pretty fast XML parser for all file sizes") | ||
+ | print ("Usage: "+__file__+" <xmlfiles>") | ||
+ | exit() | ||
+ | return | ||
+ | import os,sys,re,glob | ||
+ | |||
+ | |||
+ | filespec = '*.xml' | ||
+ | blockl = 100000 # Must be large enough to contain an instance of the highest node you are looking for. | ||
+ | |||
+ | |||
+ | def main(): | ||
+ | result = read_it(filespec) | ||
+ | print("Heading1\t->\tHeading2t") | ||
+ | for tag1id in result: | ||
+ | for tag2id in result[tag1id]: | ||
+ | print(tag1id+"\t->\t"+tag2id) | ||
+ | return | ||
+ | |||
+ | |||
+ | def read_it(filespec): | ||
+ | result = {} | ||
+ | filelist = glob.glob(filespec) | ||
+ | for filename in filelist: | ||
+ | if os.path.isfile(filename): | ||
+ | with open (filename,"r") as f1: | ||
+ | blockrest = '' | ||
+ | newblock = f1.read(blockl) | ||
+ | # Remove garbage from the beginning of the file | ||
+ | garbage_mo = re.match('\s*<\?.*?\?>\s*',newblock) | ||
+ | if garbage_mo: | ||
+ | newblock = newblock[garbage_mo.end():] | ||
+ | while len(newblock) > 0: | ||
+ | block = blockrest+newblock) | ||
+ | blockrest,result = do_it(block,result) | ||
+ | newblock = f1.read(blockl) | ||
+ | return result | ||
+ | |||
+ | |||
+ | def do_it(block,result): | ||
+ | tag1list = get_node(block,'TAG1') | ||
+ | for tag1 in tag1list: | ||
+ | #Remove processed part of block | ||
+ | #block = block.replace(tag1,'',1) | ||
+ | #or everything from the beginning | ||
+ | dataend = block.find(mecontext)+len(mecontext) | ||
+ | block = block[dataend:] | ||
+ | |||
+ | tag1id = get_attr(tag1,'ID') | ||
+ | if tag1id in result: | ||
+ | pass | ||
+ | else: | ||
+ | result[tag1id] = set() | ||
+ | tag2list = get_node(tag1,'TAG2') | ||
+ | for tag2 in tag2list: | ||
+ | tag2id = get_attr(tag2,'ID') | ||
+ | content = get_content(tag2,'TAG3') | ||
+ | result[tag1id].add(tag2id) | ||
+ | return block,result | ||
+ | |||
+ | |||
+ | def get_node(block,tag): | ||
+ | taglist = re.findall('<\s*'+tag+'[\s\>].*?</'+tag+'\s*>',block,re.DOTALL) | ||
+ | if len(taglist) == 0: | ||
+ | taglist = re.findall('<\s*'+tag+'.*?/>',block,re.DOTALL) | ||
+ | return taglist | ||
+ | |||
+ | |||
+ | def get_content(block,tag): | ||
+ | content = None | ||
+ | content_mo = re.search('<\s*'+tag+'[\s\>].*?>?(.*?)</'+tag+'\s*>',block,re.DOTALL) | ||
+ | if content_mo: | ||
+ | content = content_mo.group(1) | ||
+ | return content | ||
+ | |||
+ | |||
+ | def get_attr(block,attr): | ||
+ | attrvalue = None | ||
+ | attribute_mo = re.search(r' '+attr+'\s*=\s*[\"\'](.*?)[\"\']',block,re.DOTALL) | ||
+ | if attribute_mo: | ||
+ | attrvalue = attribute_mo.group(1) | ||
+ | return(attrvalue) | ||
+ | |||
+ | |||
main() | main() | ||
</syntaxhighlight> | </syntaxhighlight> |
Latest revision as of 16:42, 11 August 2022
TODO:
- Check iterparse for parsing large files.
- Check lxml.etree as, according to the developers, it offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer.
- Convert XML to a dict
import xmltodict
dict1 = xmltodict.parse(xml-string)
- from xml.etree import ElementTree as ET
- Load XML-support
- tree = ET.parse(filename)
- Read the file or an XML string into an XML-tree
- root = tree.getroot()
- root = ET.fromstring(xml-string)
- Get the tree root element object
- for element in root
- Get the elements in root
- element.tag
- element.attrib
- element.text
- <tag attribname=attrib>text</tag>
- element.find(elementname)
- Get first subelement by name (element can be root too)
Code example:
#!/usr/bin/env python3
def usage():
    """Print a short usage message for this script and terminate.

    Raises:
        SystemExit: always, after both message lines have been printed.
    """
    print("Print XML tags, attributes and text")
    print("Usage: " + __file__ + " <xmlfile>")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # installed by the site module for interactive use and is not guaranteed
    # to exist (for example under "python -S").
    raise SystemExit
import os
import sys
from xml.etree import ElementTree as ET
def main():
    """Print all elements of the XML file named by the single CLI argument.

    Expects exactly one command-line argument (the XML file name); otherwise
    usage() is called. The tree is read with get_tree(), every element below
    the root is printed with get_all(), and finally the root object itself
    is printed.
    """
    if len(sys.argv) != 2:
        usage()
        # usage() terminates the script; returning here keeps the code below
        # from running with an undefined file name if it ever does not.
        return
    tree = get_tree(sys.argv[1])
    root = tree.getroot()
    get_all(root)
    print(root)
def get_all(element, prefix=""):
    """Recursively print tag, attributes and text of every descendant.

    Args:
        element: an iterable XML element whose children expose ``tag``,
            ``attrib`` and ``text`` (the root may be passed as well).
        prefix: indentation accumulated so far; one tab is added per level.

    Returns:
        None. One line per descendant is written to standard output.
    """
    prefix = prefix + "\t"
    for child in element:
        print(prefix, child.tag, child.attrib, child.text)
        # Descend with the extended prefix so nesting depth shows as tabs.
        get_all(child, prefix)
def get_tree(filename):
    """Parse an XML file into an ElementTree.

    Args:
        filename: path of the XML file to read.

    Returns:
        The tree returned by ET.parse(filename). If filename is not an
        existing regular file, usage() is called instead (which terminates
        the script); None is returned should usage() ever return.
    """
    if not os.path.isfile(filename):
        usage()
        return None
    # ET.parse accepts a file name directly, so the file need not be opened
    # by hand.
    return ET.parse(filename)
# Run main() only when executed as a script, so that importing this file
# has no side effects.
if __name__ == "__main__":
    main()
Flatten XML
The code below converts any XML file into one line per leaf. It processes the file tag by tag and does not read the entire file into memory; therefore it can handle very large files regardless of the memory available. It is very slow when large files are processed (iterparse may be helpful here).
Output format:
tag1 tag1/tag2.attrib.=value tag1/tag2.attrib.=value/tag3 content
#!/usr/bin/env python3
def usage():
    """Print a short usage message for this script and terminate.

    Raises:
        SystemExit: always, after both message lines have been printed.
    """
    print("Convert any XML to 1 line per leave including attributes and content")
    print("Usage: " + __file__ + " [<xmlfiles>]")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # installed by the site module for interactive use and is not guaranteed
    # to exist (for example under "python -S").
    raise SystemExit
import os
import sys
scriptname = os.path.abspath(__file__)
## end of standard part ##
import glob,re
# Number of characters requested from the input file per read() call in readit().
blockl = 65536
# readit() stops parsing the current buffer once this many characters or fewer
# remain and carries that remainder over to be joined with the next chunk.
minblock = 256
def main():
    """Flatten every XML file matched by the command-line arguments.

    Each argument is expanded as its own glob pattern. Joining all arguments
    into a single space-separated pattern, as was done before, matches
    nothing as soon as more than one file name is given (for example after
    the shell has already expanded ``*.xml``).
    """
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if not os.path.isfile(filename):
                continue
            # The context manager closes the file even if parsing fails.
            with open(filename, "r") as f1:
                print('File is open')
                readit(f1)
def readit(f1):
    """Read an open XML file chunk by chunk and feed it to doit().

    Args:
        f1: file object opened for reading; read(blockl) is called on it
            until it returns an empty result.

    Line breaks are removed from every chunk and whitespace between adjacent
    tags is collapsed. While more than ``minblock`` characters are buffered,
    doit() consumes from the front; the short remainder is prepended to the
    next chunk so that a tag cut off at a chunk boundary can be completed.
    After the last chunk the remainder is parsed until at most one character
    is left.
    """
    taglst = []
    blockrest = ''
    newblock = f1.read(blockl)
    while len(newblock) > 0:
        block = re.sub(
            r'>\s*<', '><',
            blockrest + newblock.replace('\n', '').replace('\r', ''))
        while len(block) > minblock:
            before = len(block)
            block, taglst = doit(block, taglst)
            if len(block) >= before:
                # doit() consumed nothing; wait for more input instead of
                # spinning forever on the same buffer.
                break
        blockrest = block
        newblock = f1.read(blockl)
    block = blockrest
    while len(block) > 1:
        before = len(block)
        block, taglst = doit(block, taglst)
        if len(block) >= before:
            # Unparseable trailing data: stop rather than loop endlessly.
            break
def doit(block, taglst):
    """Consume one element start from the front of ``block`` and print its path.

    NOTE(review): the original indentation of this function was lost; the
    nesting below is a reconstruction in which content lookup and printing
    happen only when an opening tag was found -- confirm against the
    author's version.

    Args:
        block: text buffer to parse; line breaks are expected to have been
            removed already.
        taglst: stack (list of str) describing the currently open elements.
            It is modified in place.

    Returns:
        Tuple ``(block, taglst)``: the unconsumed rest of the buffer and the
        updated stack.

    For an opening or self-closing tag an entry ``tag[.attributes][.=content]``
    is pushed on the stack and the stack is printed joined with '/'. The
    entry is popped again for a self-closing tag and when an end tag follows
    directly.
    """
    # Skip a leading processing instruction such as <?xml ... ?>.
    mo1 = re.match(r'\s*<\?.*?\?>\s*', block)
    if mo1:
        block = block[mo1.end():]

    # Opening or self-closing tag. Group 1 is "name attributes"; group 2 is
    # the '/' of a self-closing tag and None otherwise (group() never raises
    # for a group that did not take part in the match).
    endtag = None
    mo1 = re.match(r'\s*<\s*([^\/\?].*?)(\/)?>', block)
    if mo1:
        endtag = mo1.group(2)
        block = block[mo1.end():]
        inner = mo1.group(1)
        tagl1 = [re.match(r'\S*', inner).group(0)]
        # Everything after the first whitespace is the attribute list.
        mo2 = re.search(r'\s+(.*)', inner)
        if mo2:
            attdct = get_attr(mo2.group(0))
            tagl1.append(
                '.'.join(attr + '=' + attdct[attr] for attr in attdct))
        # Text content directly following the tag, up to the next '<'.
        mo2 = re.match(r'([^<].*?)<', block)
        if mo2:
            # Keep the '<' in the buffer: it starts the next tag.
            block = block[mo2.end() - 1:]
            tagl1.append('=' + mo2.group(1))
        taglst.append('.'.join(tagl1))
        print('/'.join(taglst))
    else:
        # Text that does not directly follow an opening tag (for example
        # tail text after an end tag) is skipped so parsing can continue.
        stray = re.match(r'[^<]+(?=<)', block)
        if stray:
            block = block[stray.end():]

    # A self-closing element is complete: remove it from the stack again.
    if endtag and taglst:
        taglst.pop()

    # End tag. Self-closing siblings are deliberately NOT matched here; they
    # are elements in their own right and are handled by the tag match above
    # on the next call. The stack is only popped when it is not empty so
    # unbalanced input does not raise IndexError.
    mo1 = re.match(r'\s*<\/.*?>', block)
    if mo1:
        if taglst:
            taglst.pop()
        block = block[mo1.end():]

    return (block, taglst)
def get_attr(attrs):
    """Split the attribute part of a tag into a dict.

    Args:
        attrs: the text following the tag name, e.g. ``' a="1" b="x y"'``.

    Returns:
        dict mapping attribute name to its value exactly as written,
        including any surrounding quotes. Empty when nothing matches.

    Quoted values are taken up to the matching closing quote, so values
    containing spaces are no longer cut at the first space. Attribute names
    may contain ':', '.' and '-' (e.g. ``xml:lang``).
    """
    pattern = r'([\w:.-]+)=("[^"]*"|\'[^\']*\'|\S+)'
    return {att.group(1): att.group(2) for att in re.finditer(pattern, attrs)}
# Run main() only when executed as a script, so that importing this file
# has no side effects.
if __name__ == "__main__":
    main()
Parse into dict
#!/usr/bin/env python2
def usage():
    """Print a short usage message for this script and terminate.

    Raises:
        SystemExit: always, after both message lines have been printed.
    """
    print("Simple and pretty fast XML parser for all file sizes")
    print("Usage: " + __file__ + " <xmlfiles>")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # installed by the site module for interactive use and is not guaranteed
    # to exist (for example under "python -S").
    raise SystemExit
import os,sys,re,glob
# Glob pattern that main() passes to read_it() to select the input files.
filespec = '*.xml'
# Number of characters requested per read() call in read_it().
blockl = 100000 # Must be large enough to contain an instance of the highest node you are looking for.
def main():
    """Parse the files matching ``filespec`` and print the id pairs found.

    read_it() returns a dict mapping each first-level id to a set of
    second-level ids; one line is printed per pair below a heading line.
    """
    result = read_it(filespec)
    print("Heading1\t->\tHeading2t")
    for tag1id in result:
        for tag2id in result[tag1id]:
            # Ids are None when the attribute is missing; str() keeps the
            # concatenation from raising TypeError in that case.
            print(str(tag1id) + "\t->\t" + str(tag2id))
def read_it(filespec):
    """Parse all files matching a glob pattern chunk by chunk.

    Args:
        filespec: glob pattern selecting the files to read.

    Returns:
        dict filled by do_it(); empty when no file matches.

    Each file is read ``blockl`` characters at a time. A leading
    ``<? ... ?>`` declaration is removed from the first chunk. Whatever
    do_it() leaves unprocessed is prepended to the next chunk, so a node
    split across two reads is seen in one piece on the following pass.
    """
    result = {}
    for filename in glob.glob(filespec):
        if not os.path.isfile(filename):
            continue
        with open(filename, "r") as f1:
            blockrest = ''
            newblock = f1.read(blockl)
            # Remove garbage from the beginning of the file
            garbage_mo = re.match(r'\s*<\?.*?\?>\s*', newblock)
            if garbage_mo:
                newblock = newblock[garbage_mo.end():]
            while len(newblock) > 0:
                # The original line had a stray ')' here (syntax error).
                block = blockrest + newblock
                blockrest, result = do_it(block, result)
                newblock = f1.read(blockl)
    return result
def do_it(block, result):
    """Extract all complete TAG1 nodes from ``block`` into ``result``.

    Args:
        block: text buffer that may contain zero or more TAG1 nodes.
        result: dict mapping the ID attribute of a TAG1 node to the set of
            ID attributes of the TAG2 nodes found inside it. Updated in
            place.

    Returns:
        Tuple ``(block, result)`` where ``block`` is the text after the last
        processed TAG1 node (the whole input when none was found).
    """
    for tag1 in get_node(block, 'TAG1'):
        # Drop everything up to and including this node. The original code
        # referenced an undefined name (mecontext) here; the node text
        # itself is what has to be located.
        dataend = block.find(tag1) + len(tag1)
        block = block[dataend:]

        tag1id = get_attr(tag1, 'ID')
        tag2ids = result.setdefault(tag1id, set())
        for tag2 in get_node(tag1, 'TAG2'):
            tag2id = get_attr(tag2, 'ID')
            # NOTE(review): the TAG3 content is read but not stored in
            # result -- presumably kept as an example; confirm.
            content = get_content(tag2, 'TAG3')
            tag2ids.add(tag2id)
    return block, result
def get_node(block, tag):
    """Return the text of every ``tag`` node found in ``block``.

    Args:
        block: text to search.
        tag: element name; it is escaped, so regex metacharacters in the
            name are matched literally.

    Returns:
        list of str. Nodes with an explicit end tag are returned if there
        are any; otherwise self-closing ``<tag ... />`` nodes. Empty list
        when nothing matches.
    """
    name = re.escape(tag)
    taglist = re.findall(
        r'<\s*' + name + r'[\s>].*?</' + name + r'\s*>', block, re.DOTALL)
    if not taglist:
        # Self-closing form. The match is confined to a single tag
        # ([^<>]*), so an opening tag whose end tag has not been read yet
        # is not mistaken for a self-closing node just because some later
        # element ends in "/>".
        taglist = re.findall(
            r'<\s*' + name + r'(?:\s[^<>]*)?/>', block, re.DOTALL)
    return taglist
def get_content(block, tag):
    """Return the text between ``<tag ...>`` and ``</tag>`` in ``block``.

    Args:
        block: text to search.
        tag: element name (matched literally).

    Returns:
        The content of the first matching node, or None when there is no
        such node. The opening tag including its attributes is skipped
        completely; previously the attributes ended up in the returned
        content whenever the tag had any.
    """
    name = re.escape(tag)
    # (?<!/) rejects a self-closing tag, which has no content.
    content_mo = re.search(
        r'<\s*' + name + r'(?:\s[^>]*)?(?<!/)>(.*?)</' + name + r'\s*>',
        block, re.DOTALL)
    if content_mo:
        return content_mo.group(1)
    return None
def get_attr(block, attr):
    """Return the value of the first ``attr="..."`` occurrence in ``block``.

    Args:
        block: text to search; the whole text is searched, not only the
            first tag.
        attr: attribute name (matched literally).

    Returns:
        The attribute value without its quotes, or None when the attribute
        is not present.

    The closing quote must be the same character as the opening one, so a
    value such as ``"it's"`` is returned in full. Any whitespace (not only a
    space) may precede the attribute name.
    """
    attribute_mo = re.search(
        r'\s' + re.escape(attr) + r'\s*=\s*(["\'])(.*?)\1', block, re.DOTALL)
    if attribute_mo:
        return attribute_mo.group(2)
    return None
# Run main() only when executed as a script, so that importing this file
# has no side effects.
if __name__ == "__main__":
    main()