XLS
Jump to navigation
Jump to search
Python XLS parser using standard modules. xlsx files are basically just a set of compressed (zipped) .XML pages.
A simple workbook with 3 sheets having the same content like this:
Consists of the following .xml files:
Filename | Remarks |
---|---|
./[Content_Types].xml | |
./docProps/core.xml | |
./docProps/app.xml | |
./xl/workbook.xml | Holds per sheet a node like '<sheet name="Test2" sheetId="2" state="visible" r:id="rId3"/>' |
./xl/sharedStrings.xml | Strings that appear in more sheets, references from the sheet by indexnumber. |
./xl/styles.xml | |
./xl/worksheets/sheet2.xml | Content of sheet2 |
./xl/worksheets/sheet1.xml | Content of sheet1 |
./xl/worksheets/sheet3.xml | Content of sheet3 |
./xl/_rels/workbook.xml.rels | |
./_rels/.rels |
Found this on stackoverflow. It is announced as very very basic but it works very well.
This routine returns a list of dicts like { <columname> : <cellvalue> [, .. ] } The original, with sharedStrings:
def xlsx(fname):
import zipfile
from xml.etree.ElementTree import iterparse
z = zipfile.ZipFile(fname)
# Get shared strings
strings = [ element.text
for event,element in iterparse(z.open('xl/sharedStrings.xml'))
if element.tag.endswith('}t') ]
# Get sheetnames and index
sheetdict = { element.attrib['name']:element.attrib['sheetId']
for event,element in iterparse(z.open('xl/workbook.xml'))
if element.tag.endswith('}sheet') }
rows = []
row = {}
value = ''
if sheet in sheetdict:
sheetfile = 'xl/worksheets/sheet'+sheetdict[sheet]+'.xml'
for event, element in iterparse(z.open(sheetfile)):
# get value or index to shared strings
if element.tag.endswith('}v'):
value = element.text
# If value is a shared string, use value as an index
if element.tag.endswith('}c'):
if element.attrib.get('t') == 's':
value = strings[int(value)]
# split the row/col information so that the row leter(s) can be separate
letter = element.attrib['r']
while letter[-1].isdigit():
letter = letter[:-1]
row[letter] = value
value = ''
if element.tag.endswith('}row'):
rows.append(row)
row = {}
return rows
Adapted for sheets without sharedStrings:
def xlsx(fname):
import zipfile
from xml.etree.ElementTree import iterparse
z = zipfile.ZipFile(fname)
# Get shared strings
#strings = [el.text for e, el in iterparse(z.open('xl/sharedStrings.xml'))
# if el.tag.endswith('}t')]
rows = []
row = {}
value = ''
for e, el in iterparse(z.open('xl/worksheets/sheet7.xml')):
# get value or index to shared strings
if el.tag.endswith('}v'): # <v>84</v>
value = el.text
if el.tag.endswith('}t'): # <t>String</t>
value = el.text
# If value is a shared string, use value as an index
if el.tag.endswith('}c'): # <c r="A3" t="s"><v>84</v></c>
if el.attrib.get('t') == 's':
value = strings[int(value)]
# split the row/col information so that the row leter(s) can be separate
letter = el.attrib['r'] # AZ22
while letter[-1].isdigit():
letter = letter[:-1]
row[letter] = value
value = ''
if el.tag.endswith('}row'):
rows.append(row)
row = {}
return rows