Difference between revisions of "XLS"
Line 47: | Line 47: | ||
|} | |} | ||
− | Found this on [https://stackoverflow.com/questions/4371163/reading-xlsx-files-using-python | + | Found this on [https://stackoverflow.com/ stackoverflow] where it has been removed [https://stackoverflow.com/questions/4371163/reading-xlsx-files-using-python]. It was announced as very very basic but it works very well and it only uses modules installed by default (at least in [https://www.ubuntu.com/ Ubuntu] 18.04). You can find the original solution at the end of this page. |
+ | |||
+ | The routine returns a list of dicts like { <columname> : <cellvalue> [, .. ] } | ||
+ | |||
+ | Our solution supporting workbooks with and without sharedstrings and fetches sheet content by the sheet name. Using 're' to find the column, this may be slower than the original approach but we have not found that in the test-files used. | ||
− | |||
− | |||
<syntaxhighlight lang=python> | <syntaxhighlight lang=python> | ||
− | def xlsx(fname): | + | def xlsx(fname,sheet): |
import zipfile | import zipfile | ||
from xml.etree.ElementTree import iterparse | from xml.etree.ElementTree import iterparse | ||
+ | import re | ||
z = zipfile.ZipFile(fname) | z = zipfile.ZipFile(fname) | ||
− | # Get shared strings | + | if 'xl/sharedStrings.xml' in z.namelist(): |
− | + | # Get shared strings | |
− | + | strings = [element.text for event, element | |
− | + | in iterparse(z.open('xl/sharedStrings.xml')) | |
− | + | if element.tag.endswith('}t')] | |
− | sheetdict = { element.attrib['name']:element.attrib['sheetId'] | + | sheetdict = { element.attrib['name']:element.attrib['sheetId'] for event,element in iterparse(z.open('xl/workbook.xml')) |
− | + | if element.tag.endswith('}sheet') } | |
− | |||
− | |||
rows = [] | rows = [] | ||
row = {} | row = {} | ||
Line 80: | Line 81: | ||
value = strings[int(value)] | value = strings[int(value)] | ||
# split the row/col information so that the row leter(s) can be separate | # split the row/col information so that the row leter(s) can be separate | ||
− | letter = element.attrib['r'] | + | letter = re.sub('\D','',element.attrib['r']) |
− | |||
− | |||
row[letter] = value | row[letter] = value | ||
value = '' | value = '' | ||
Line 91: | Line 90: | ||
</syntaxhighlight> | </syntaxhighlight> | ||
− | + | The original as found on stackoverflow: | |
<syntaxhighlight lang=python> | <syntaxhighlight lang=python> | ||
def xlsx(fname): | def xlsx(fname): | ||
Line 97: | Line 96: | ||
from xml.etree.ElementTree import iterparse | from xml.etree.ElementTree import iterparse | ||
z = zipfile.ZipFile(fname) | z = zipfile.ZipFile(fname) | ||
− | + | strings = [el.text for e, el in iterparse(z.open('xl/sharedStrings.xml')) if el.tag.endswith('}t')] | |
− | |||
− | |||
rows = [] | rows = [] | ||
row = {} | row = {} | ||
value = '' | value = '' | ||
− | for e, el in iterparse(z.open('xl/worksheets/ | + | for e, el in iterparse(z.open('xl/worksheets/sheet1.xml')): |
− | + | if el.tag.endswith('}v'): # <v>84</v> | |
− | if el.tag.endswith('}v'): | ||
− | |||
− | |||
value = el.text | value = el.text | ||
− | + | if el.tag.endswith('}c'): # <c r="A3" t="s"><v>84</v></c> | |
− | if el.tag.endswith('}c'): | ||
if el.attrib.get('t') == 's': | if el.attrib.get('t') == 's': | ||
value = strings[int(value)] | value = strings[int(value)] | ||
− | + | letter = el.attrib['r'] # AZ22 | |
− | letter = el.attrib['r'] | ||
while letter[-1].isdigit(): | while letter[-1].isdigit(): | ||
letter = letter[:-1] | letter = letter[:-1] |
Revision as of 13:05, 28 October 2018
Python XLS parser using standard modules. xlsx files are basically just a set of compressed (zipped) .XML pages.
A simple workbook with 3 sheets having the same content like this:
Consists of the following .xml files:
Filename | Remarks |
---|---|
./[Content_Types].xml | |
./docProps/core.xml | |
./docProps/app.xml | |
./xl/workbook.xml | Holds per sheet a node like '<sheet name="Test2" sheetId="2" state="visible" r:id="rId3"/>' |
./xl/sharedStrings.xml | Strings that appear in more sheets, references from the sheet by indexnumber. |
./xl/styles.xml | |
./xl/worksheets/sheet2.xml | Content of sheet2 |
./xl/worksheets/sheet1.xml | Content of sheet1 |
./xl/worksheets/sheet3.xml | Content of sheet3 |
./xl/_rels/workbook.xml.rels | |
./_rels/.rels |
Found this on stackoverflow where it has been removed [1]. It was announced as very very basic but it works very well and it only uses modules installed by default (at least in Ubuntu 18.04). You can find the original solution at the end of this page.
The routine returns a list of dicts like { <columname> : <cellvalue> [, .. ] }
Our solution supporting workbooks with and without sharedstrings and fetches sheet content by the sheet name. Using 're' to find the column, this may be slower than the original approach but we have not found that in the test-files used.
def xlsx(fname,sheet):
import zipfile
from xml.etree.ElementTree import iterparse
import re
z = zipfile.ZipFile(fname)
if 'xl/sharedStrings.xml' in z.namelist():
# Get shared strings
strings = [element.text for event, element
in iterparse(z.open('xl/sharedStrings.xml'))
if element.tag.endswith('}t')]
sheetdict = { element.attrib['name']:element.attrib['sheetId'] for event,element in iterparse(z.open('xl/workbook.xml'))
if element.tag.endswith('}sheet') }
rows = []
row = {}
value = ''
if sheet in sheetdict:
sheetfile = 'xl/worksheets/sheet'+sheetdict[sheet]+'.xml'
for event, element in iterparse(z.open(sheetfile)):
# get value or index to shared strings
if element.tag.endswith('}v'):
value = element.text
# If value is a shared string, use value as an index
if element.tag.endswith('}c'):
if element.attrib.get('t') == 's':
value = strings[int(value)]
# split the row/col information so that the row leter(s) can be separate
letter = re.sub('\D','',element.attrib['r'])
row[letter] = value
value = ''
if element.tag.endswith('}row'):
rows.append(row)
row = {}
return rows
The original as found on stackoverflow:
def xlsx(fname):
import zipfile
from xml.etree.ElementTree import iterparse
z = zipfile.ZipFile(fname)
strings = [el.text for e, el in iterparse(z.open('xl/sharedStrings.xml')) if el.tag.endswith('}t')]
rows = []
row = {}
value = ''
for e, el in iterparse(z.open('xl/worksheets/sheet1.xml')):
if el.tag.endswith('}v'): # <v>84</v>
value = el.text
if el.tag.endswith('}c'): # <c r="A3" t="s"><v>84</v></c>
if el.attrib.get('t') == 's':
value = strings[int(value)]
letter = el.attrib['r'] # AZ22
while letter[-1].isdigit():
letter = letter[:-1]
row[letter] = value
value = ''
if el.tag.endswith('}row'):
rows.append(row)
row = {}
return rows