Difference between revisions of "Python:XML"
Jump to navigation
Jump to search
(Created page with ";from xml.etree import ElementTree :Load XML-support ;tree = get_tree(filename) :Read the file into an XML-tree ;root = tree.getroot() :Get the tree root element object ;fo...") |
|||
Line 16: | Line 16: | ||
:<tag attribname=attrib>text</tag> | :<tag attribname=attrib>text</tag> | ||
− | + | Code example: | |
<syntaxhighlight lang=python> | <syntaxhighlight lang=python> | ||
#!/usr/bin/env python3 | #!/usr/bin/env python3 | ||
Line 56: | Line 56: | ||
usage() | usage() | ||
return(tree) | return(tree) | ||
+ | |||
+ | main() | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ===Flatten XML=== | ||
+ | The code below converts any XML file into one line per leaf. | |
+ | Output format: | ||
+ | tag1 | ||
+ | tag1/tag2.attrib.=value | |
+ | tag1/tag2.attrib.=value/tag3 content | ||
+ | |||
+ | <syntaxhighlight lang=python> | ||
+ | #!/usr/bin/env python3 | ||
+ | |||
+ | def usage(): | ||
+ | print ("Convert any XML to 1 line per leave including attributes and content") | ||
+ | print ("Usage: "+__file__+" [<xmlfiles>]") | ||
+ | exit() | ||
+ | return | ||
+ | |||
+ | import os | ||
+ | import sys | ||
+ | |||
+ | scriptname = os.path.abspath(__file__) | ||
+ | |||
+ | ## end of standard part ## | ||
+ | |||
+ | import glob,re | ||
+ | |||
+ | blockl = 65536 | ||
+ | minblock = 256 | ||
+ | |||
+ | def main(): | ||
+ | files = glob.glob(' '.join(sys.argv[1:])) | ||
+ | for filename in files: | ||
+ | if os.path.isfile(filename): | ||
+ | f1 = open (filename,"r") | ||
+ | if f1: | ||
+ | print('File is open') | ||
+ | readit(f1) | ||
+ | return | ||
+ | |||
+ | def readit(f1): | ||
+ | eof = 0 | ||
+ | taglst = [] | ||
+ | blockrest = '' | ||
+ | newblock = f1.read(blockl) | ||
+ | while len(newblock) > 0: | ||
+ | block = re.sub('>\s*<','><',blockrest+newblock.replace('\n','').replace('\r','')) | ||
+ | while len(block) > minblock: | ||
+ | block,taglst = doit(block,taglst) | ||
+ | blockrest = block | ||
+ | newblock = f1.read(blockl) | ||
+ | #print('newblocklengte = '+str(len(newblock))) | ||
+ | block = blockrest | ||
+ | while len(block) > 1: | ||
+ | block,taglst = doit(block,taglst) | ||
+ | #print('Blocklengte = '+str(len(block))) | ||
+ | return | ||
+ | |||
+ | def doit(block,taglst): | ||
+ | #print('blockt '+block) | ||
+ | #mo1 = re.finditer('<([^\?]\S*?)(\s+.*?=.*?)?>([^<].*?<)?(\/.*?>)?',block) | ||
+ | # Remove garbage from the beginning | ||
+ | mo1 = re.match('\s*<\?.*?\?>\s*',block) | ||
+ | if mo1: | ||
+ | block = block[mo1.end():] | ||
+ | # Find a tag | ||
+ | endtag = '' | ||
+ | mo1 = re.match('\s*<\s*([^\/\?].*?)(\/)?>',block) | ||
+ | if mo1: | ||
+ | #print('group0tag '+mo1.group(0)) | ||
+ | try: | ||
+ | endtag = mo1.group(2) | ||
+ | except NameError: | ||
+ | pass | ||
+ | block = block[mo1.end():] | ||
+ | #print('mo1group ',mo1.group(1)) | ||
+ | mo2 = re.match('\S*',mo1.group(1)) | ||
+ | if mo2: | ||
+ | tag = mo2.group(0) | ||
+ | tagl1 = [tag] | ||
+ | # Get the tag attributes | ||
+ | mo2 = re.search('\s+(.*)',mo1.group(1)) | ||
+ | if mo2: | ||
+ | attdct = get_attr(mo2.group(0)) | ||
+ | attrline = '' | ||
+ | for attr in attdct: | ||
+ | attrline += '.'+attr+'='+attdct[attr] | ||
+ | tagl1.append(attrline[1:]) | ||
+ | #print('blockc '+block) | ||
+ | # Find content | ||
+ | mo1 = re.match('([^<].*?)<',block) | ||
+ | if mo1: | ||
+ | #print('group0cont '+mo1.group(0)) | ||
+ | block = block[mo1.end()-1:] | ||
+ | content = mo1.group(1) | ||
+ | tagl1.append('='+content) | ||
+ | |||
+ | try: | ||
+ | taglst.append('.'.join(tagl1)) | ||
+ | except: | ||
+ | pass | ||
+ | print('/'.join(taglst)) | ||
+ | # Find end-tag | ||
+ | if endtag: | ||
+ | popped = taglst.pop() | ||
+ | #print('Popped '+popped) | ||
+ | #print('blocke '+block) | ||
+ | mo1 = re.match('\s*(<\/.*?>|<\w*?\/>)',block) | ||
+ | if mo1: | ||
+ | popped = taglst.pop() | ||
+ | #print('Popped '+mo1.group(0)) | ||
+ | block = block[mo1.end():] | ||
+ | |||
+ | return (block,taglst) | ||
+ | |||
+ | def get_attr(attrs): | ||
+ | attdct = {} | ||
+ | mo2 = re.finditer('(\w+)=(\S+)',attrs) | ||
+ | if mo2: | ||
+ | for att in mo2: | ||
+ | attdct[att.group(1)] = att.group(2) | ||
+ | return(attdct) | ||
main() | main() | ||
</syntaxhighlight> | </syntaxhighlight> |
Revision as of 23:15, 27 January 2018
- from xml.etree import ElementTree
- Load XML-support
- tree = get_tree(filename)
- Read the file into an XML-tree
- root = tree.getroot()
- Get the tree root element object
- for element in root
- Get the elements in root
- element.tag
- element.attrib
- element.text
- <tag attribname=attrib>text</tag>
Code example:
#!/usr/bin/env python3
def usage():
    """Print the help text of the XML tag printer and terminate.

    Two lines go to standard output: a one-line summary and the command
    syntax, which embeds this script's ``__file__``.  ``exit()`` is then
    called, so control never returns to the caller.
    """
    print("Print XML tags, attributes and text")
    print("Usage: ", __file__, " <xmlfile>", sep="")
    exit()
import os
import sys
from xml.etree import ElementTree
def main():
    """Parse the XML file named on the command line and print its elements.

    Exactly one command-line argument (the file name) is expected; with any
    other argument count ``usage()`` is called.  The file is loaded through
    ``get_tree()``, every descendant of the root is printed by ``get_all()``,
    and finally the root element object itself is printed.
    """
    argv = sys.argv
    if len(argv) != 2:
        usage()
    else:
        filename = argv[1]
    root = get_tree(filename).getroot()
    get_all(root)
    print(root)
def get_all(element, prefix=""):
    """Print every descendant of *element*, one line per element.

    Each line holds the indentation prefix, the tag, the attribute dict and
    the text of one element, in document (pre-order) order.  The prefix
    grows by one tab per nesting level, starting at ``prefix + "\\t"`` for
    the direct children of *element*.  *element* itself is not printed.
    """
    # Explicit stack of (child iterator, indentation) pairs instead of
    # recursion; the top entry is the level currently being walked.
    pending = [(iter(element), prefix + "\t")]
    while pending:
        children, indent = pending[-1]
        node = next(children, None)
        if node is None:
            # This level is exhausted; resume the parent level.
            pending.pop()
            continue
        print(indent, node.tag, node.attrib, node.text)
        pending.append((iter(node), indent + "\t"))
def get_tree(filename):
    """Return the parsed ``ElementTree`` for the XML file *filename*.

    When *filename* does not name an existing regular file, ``usage()`` is
    called instead of parsing.
    """
    if not os.path.isfile(filename):
        usage()
    else:
        tree = ElementTree.parse(filename)
    return tree
# Run the script: main() is called as soon as this module body executes.
main()
Flatten XML
The code below converts any XML file into one line per leaf. Output format:
tag1 tag1/tag2.attrib.=value tag1/tag2.attrib.=value/tag3 content
#!/usr/bin/env python3
def usage():
    """Print the help text of the XML flattener and terminate.

    Two lines go to standard output: a one-line summary and the command
    syntax, which embeds this script's ``__file__``.  ``exit()`` is then
    called, so control never returns to the caller.
    """
    print("Convert any XML to 1 line per leave including attributes and content")
    print("Usage: ", __file__, " [<xmlfiles>]", sep="")
    exit()
import os
import sys
# Absolute path of this script file (not referenced by the code shown on this page).
scriptname = os.path.abspath(__file__)
## end of standard part ##
import glob,re
# Number of characters readit() requests from the file per read() call.
blockl = 65536
# While more input may follow, readit() stops calling doit() once this many
# characters or fewer remain; the remainder is prepended to the next chunk.
minblock = 256
def main():
    """Flatten every XML file named on the command line.

    Each command-line argument is expanded with ``glob.glob`` on its own, so
    several file names or wildcard patterns can be given at once.  (Joining
    all arguments into one pattern matched nothing as soon as more than one
    argument was passed.)  Every match that is a regular file is opened in
    text mode, announced with ``File is open`` and handed to ``readit()``.
    The file is closed again when ``readit()`` returns or raises.
    """
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            if not os.path.isfile(filename):
                # Directories and other non-regular matches are skipped.
                continue
            with open(filename, "r") as f1:
                print('File is open')
                readit(f1)
    return
def _drain(block, taglst, floor):
    """Call doit() until at most *floor* characters remain or it stalls."""
    while len(block) > floor:
        shorter, taglst = doit(block, taglst)
        if len(shorter) >= len(block):
            # doit() consumed nothing.  Its result depends only on the text,
            # so calling it again on the same text would loop forever.
            break
        block = shorter
    return block, taglst


def readit(f1):
    """Read the open text file *f1* in chunks and flatten it with doit().

    The file is read ``blockl`` characters at a time.  Line breaks are
    removed from each chunk and whitespace between adjacent tags is
    collapsed.  While more input may follow, processing of a chunk stops once
    ``minblock`` characters or fewer remain, so that a tag split across two
    reads is completed by the next chunk before it is parsed.  After the last
    read the remainder is processed down to a single character.

    Processing also stops as soon as ``doit()`` makes no progress, so text
    that cannot be parsed (for example trailing characters without any
    ``<``) no longer causes an endless loop.
    """
    taglst = []
    pending = ''
    chunk = f1.read(blockl)
    while chunk:
        joined = pending + chunk.replace('\n', '').replace('\r', '')
        block = re.sub(r'>\s*<', '><', joined)
        pending, taglst = _drain(block, taglst, minblock)
        chunk = f1.read(blockl)
    # End of file: nothing more can complete a partial tag, so parse the rest.
    _drain(pending, taglst, 1)
    return
def doit(block,taglst):
    """Consume one element start (plus at most one end tag) from *block*.

    *block* is XML text without line breaks; *taglst* is the stack of path
    components of the elements that are currently open.  In order, one call:

    1. drops a leading ``<? ... ?>`` processing instruction or declaration;
    2. matches one opening or self-closing tag and builds its path component
       ``tag[.attr=value...][.=content]`` (attributes via ``get_attr()``);
    3. matches the text that follows, up to the next ``<``;
    4. pushes the component, prints the ``/``-joined path, and pops it again
       at once if the tag was self-closing;
    5. matches one closing tag ``</...>`` and pops the innermost open element.

    Text that is not preceded by an opening tag in the same call (text after
    a child's closing tag) is printed as ``<current path>.=<text>``.

    Returns ``(remaining_block, taglst)``; *taglst* is modified in place.
    When nothing matches, *block* comes back unchanged and nothing is printed.

    NOTE(review): comments and DOCTYPE declarations are still matched as
    ordinary tags, because only names starting with ``/`` or ``?`` are
    excluded by the tag pattern.
    """
    # Remove a processing instruction / XML declaration from the beginning.
    mo1 = re.match(r'\s*<\?.*?\?>\s*', block)
    if mo1:
        block = block[mo1.end():]

    # Find an opening tag; group 2 is the '/' of a self-closing tag.
    tagl1 = None
    selfclosing = False
    mo1 = re.match(r'\s*<\s*([^\/\?].*?)(\/)?>', block)
    if mo1:
        selfclosing = mo1.group(2) is not None
        block = block[mo1.end():]
        inner = mo1.group(1)
        # The tag name is everything up to the first whitespace.
        tagl1 = [re.match(r'\S*', inner).group(0)]
        # Anything after that whitespace is treated as the attribute list.
        mo2 = re.search(r'\s+(.*)', inner)
        if mo2:
            attdct = get_attr(mo2.group(0))
            tagl1.append('.'.join(attr + '=' + attdct[attr] for attr in attdct))

    # Find text content: everything up to, but not including, the next '<'.
    mo1 = re.match(r'([^<].*?)<', block)
    if mo1:
        block = block[mo1.end() - 1:]
        content = mo1.group(1)
        if tagl1 is not None:
            tagl1.append('=' + content)
        else:
            # Text without an opening tag in this call: report it against the
            # elements that are currently open instead of failing on the
            # component list that was never created.
            print('/'.join(taglst) + '.=' + content)

    # Emit a line only when an element was actually opened in this call.
    if tagl1 is not None:
        taglst.append('.'.join(tagl1))
        print('/'.join(taglst))
        if selfclosing:
            taglst.pop()

    # Find a closing tag.  Only '</...>' counts: a following self-closing
    # sibling must be left for the next call, not consumed as an end tag.
    mo1 = re.match(r'\s*<\/.*?>', block)
    if mo1:
        if taglst:
            # A stray closing tag with nothing open must not raise IndexError.
            taglst.pop()
        block = block[mo1.end():]

    return (block, taglst)
def get_attr(attrs):
    """Return the ``name=value`` pairs found in *attrs* as a dict.

    *attrs* is the text that follows the tag name inside a tag.  Values are
    returned exactly as written, including any surrounding quotes.  A quoted
    value may contain whitespace; an unquoted value ends at the first
    whitespace.  Attribute names may contain ``:``, ``.`` and ``-`` in
    addition to word characters, so names such as ``xml:lang`` or ``data-x``
    are kept whole.  When a name occurs more than once, the last value wins.
    """
    # Value alternatives, tried in order: "double quoted", 'single quoted',
    # then a bare run of non-whitespace characters.
    pattern = r'''([\w:.-]+)=("[^"]*"|'[^']*'|\S+)'''
    return {att.group(1): att.group(2) for att in re.finditer(pattern, attrs)}
# Run the script: main() is called as soon as this module body executes.
main()