Difference between revisions of "XLS"
Jump to navigation
Jump to search
Line 57: | Line 57: | ||
z = zipfile.ZipFile(fname) | z = zipfile.ZipFile(fname) | ||
# Get shared strings | # Get shared strings | ||
− | strings = [ | + | strings = [ element.text |
− | + | for event,element in iterparse(z.open('xl/sharedStrings.xml')) | |
− | + | if element.tag.endswith('}t') ] | |
+ | # Get sheetnames and index | ||
+ | sheetdict = { element.attrib['name']:element.attrib['sheetId'] | ||
+ | for event,element in iterparse(z.open('xl/workbook.xml')) | ||
+ | if element.tag.endswith('}sheet') } | ||
+ | |||
rows = [] | rows = [] | ||
row = {} | row = {} | ||
value = '' | value = '' | ||
− | for | + | |
− | + | if sheet in sheetdict: | |
− | + | sheetfile = 'xl/worksheets/sheet'+sheetdict[sheet]+'.xml' | |
− | + | for event, element in iterparse(z.open(sheetfile)): | |
− | + | # get value or index to shared strings | |
− | + | if element.tag.endswith('}v'): | |
− | + | value = element.text | |
− | + | # If value is a shared string, use value as an index | |
− | + | if element.tag.endswith('}c'): | |
− | + | if element.attrib.get('t') == 's': | |
− | + | value = strings[int(value)] | |
− | + | # split the row/col information so that the row leter(s) can be separate | |
− | + | letter = element.attrib['r'] | |
− | + | while letter[-1].isdigit(): | |
− | + | letter = letter[:-1] | |
− | + | row[letter] = value | |
− | + | value = '' | |
+ | if element.tag.endswith('}row'): | ||
+ | rows.append(row) | ||
+ | row = {} | ||
return rows | return rows | ||
</syntaxhighlight> | </syntaxhighlight> |
Revision as of 22:33, 27 October 2018
Python XLS parser using standard modules. xlsx files are basically just a set of compressed (zipped) .XML pages.
A simple workbook with 3 sheets having the same content like this:
Consists of the following .xml files:
Filename | Remarks |
---|---|
./[Content_Types].xml | |
./docProps/core.xml | |
./docProps/app.xml | |
./xl/workbook.xml | Holds per sheet a node like '<sheet name="Test2" sheetId="2" state="visible" r:id="rId3"/>' |
./xl/sharedStrings.xml | Strings that appear in more sheets, references from the sheet by indexnumber. |
./xl/styles.xml | |
./xl/worksheets/sheet2.xml | Content of sheet2 |
./xl/worksheets/sheet1.xml | Content of sheet1 |
./xl/worksheets/sheet3.xml | Content of sheet3 |
./xl/_rels/workbook.xml.rels | |
./_rels/.rels |
Found this on stackoverflow. It is announced as very very basic but it works very well.
This routine returns a list of dicts like { <columname> : <cellvalue> [, .. ] } The original, with sharedStrings:
def xlsx(fname):
import zipfile
from xml.etree.ElementTree import iterparse
z = zipfile.ZipFile(fname)
# Get shared strings
strings = [ element.text
for event,element in iterparse(z.open('xl/sharedStrings.xml'))
if element.tag.endswith('}t') ]
# Get sheetnames and index
sheetdict = { element.attrib['name']:element.attrib['sheetId']
for event,element in iterparse(z.open('xl/workbook.xml'))
if element.tag.endswith('}sheet') }
rows = []
row = {}
value = ''
if sheet in sheetdict:
sheetfile = 'xl/worksheets/sheet'+sheetdict[sheet]+'.xml'
for event, element in iterparse(z.open(sheetfile)):
# get value or index to shared strings
if element.tag.endswith('}v'):
value = element.text
# If value is a shared string, use value as an index
if element.tag.endswith('}c'):
if element.attrib.get('t') == 's':
value = strings[int(value)]
# split the row/col information so that the row leter(s) can be separate
letter = element.attrib['r']
while letter[-1].isdigit():
letter = letter[:-1]
row[letter] = value
value = ''
if element.tag.endswith('}row'):
rows.append(row)
row = {}
return rows
Adapted for sheets without sharedStrings:
def xlsx(fname):
import zipfile
from xml.etree.ElementTree import iterparse
z = zipfile.ZipFile(fname)
# Get shared strings
#strings = [el.text for e, el in iterparse(z.open('xl/sharedStrings.xml'))
# if el.tag.endswith('}t')]
rows = []
row = {}
value = ''
for e, el in iterparse(z.open('xl/worksheets/sheet7.xml')):
# get value or index to shared strings
if el.tag.endswith('}v'): # <v>84</v>
value = el.text
if el.tag.endswith('}t'): # <t>String</t>
value = el.text
# If value is a shared string, use value as an index
if el.tag.endswith('}c'): # <c r="A3" t="s"><v>84</v></c>
if el.attrib.get('t') == 's':
value = strings[int(value)]
# split the row/col information so that the row leter(s) can be separate
letter = el.attrib['r'] # AZ22
while letter[-1].isdigit():
letter = letter[:-1]
row[letter] = value
value = ''
if el.tag.endswith('}row'):
rows.append(row)
row = {}
return rows