Difference between revisions of "XLS"

From wiki
Jump to navigation Jump to search
Line 57: Line 57:
 
     z = zipfile.ZipFile(fname)
 
     z = zipfile.ZipFile(fname)
 
     # Get shared strings
 
     # Get shared strings
     strings = [el.text for e, el
+
     strings = [ element.text  
                      in iterparse(z.open('xl/sharedStrings.xml'))  
+
                  for event,element in iterparse(z.open('xl/sharedStrings.xml'))  
                      if el.tag.endswith('}t')]
+
                      if element.tag.endswith('}t') ]
 +
    # Get sheetnames and index
 +
    sheetdict = { element.attrib['name']:element.attrib['sheetId']
 +
                    for event,element in iterparse(z.open('xl/workbook.xml'))
 +
                        if element.tag.endswith('}sheet') }
 +
 
 
     rows = []
 
     rows = []
 
     row = {}
 
     row = {}
 
     value = ''
 
     value = ''
     for e, el in iterparse(z.open('xl/worksheets/sheet1.xml')):
+
 
        # get value or index to shared strings
+
     if sheet in sheetdict:
        if el.tag.endswith('}v'):                               # <v>84</v>
+
        sheetfile = 'xl/worksheets/sheet'+sheetdict[sheet]+'.xml'
            value = el.text
+
        for event, element in iterparse(z.open(sheetfile)):
        # If value is a shared string, use value as an index
+
            # get value or index to shared strings
        if el.tag.endswith('}c'):                              # <c r="A3" t="s"><v>84</v></c>
+
            if element.tag.endswith('}v'):                            
            if el.attrib.get('t') == 's':
+
                value = element.text
                value = strings[int(value)]
+
            # If value is a shared string, use value as an index
            # split the row/col information so that the row leter(s) can be separate
+
            if element.tag.endswith('}c'):                               
            letter = el.attrib['r']                             # AZ22
+
                if element.attrib.get('t') == 's':
            while letter[-1].isdigit():
+
                    value = strings[int(value)]
                letter = letter[:-1]
+
                # split the row/col information so that the row leter(s) can be separate
            row[letter] = value
+
                letter = element.attrib['r']
            value = ''
+
                while letter[-1].isdigit():
        if el.tag.endswith('}row'):
+
                    letter = letter[:-1]
            rows.append(row)
+
                row[letter] = value
            row = {}
+
                value = ''
 +
            if element.tag.endswith('}row'):
 +
                rows.append(row)
 +
                row = {}  
 
     return rows
 
     return rows
 
</syntaxhighlight>
 
</syntaxhighlight>

Revision as of 22:33, 27 October 2018


Python XLS parser using standard modules. xlsx files are basically just a set of compressed (zipped) .XML pages.

A simple workbook with 3 sheets having the same content like this:

Sample xls.png

Consists of the following .xml files:

Filename Remarks
./[Content_Types].xml
./docProps/core.xml
./docProps/app.xml
./xl/workbook.xml Holds per sheet a node like '<sheet name="Test2" sheetId="2" state="visible" r:id="rId3"/>'
./xl/sharedStrings.xml Strings that appear in more sheets, references from the sheet by indexnumber.
./xl/styles.xml
./xl/worksheets/sheet2.xml Content of sheet2
./xl/worksheets/sheet1.xml Content of sheet1
./xl/worksheets/sheet3.xml Content of sheet3
./xl/_rels/workbook.xml.rels
./_rels/.rels

Found this on stackoverflow. It is announced as very very basic but it works very well.

This routine returns a list of dicts like { <columname> : <cellvalue> [, .. ] } The original, with sharedStrings:

def xlsx(fname):
    import zipfile
    from xml.etree.ElementTree import iterparse
    z = zipfile.ZipFile(fname)
    # Get shared strings
    strings = [ element.text 
                  for event,element in iterparse(z.open('xl/sharedStrings.xml')) 
                      if element.tag.endswith('}t') ]
    # Get sheetnames and index
    sheetdict = { element.attrib['name']:element.attrib['sheetId'] 
                    for event,element in iterparse(z.open('xl/workbook.xml'))
                        if element.tag.endswith('}sheet') }

    rows = []
    row = {}
    value = ''

    if sheet in sheetdict:
        sheetfile = 'xl/worksheets/sheet'+sheetdict[sheet]+'.xml'
        for event, element in iterparse(z.open(sheetfile)):
            # get value or index to shared strings
            if element.tag.endswith('}v'):                              
                value = element.text
            # If value is a shared string, use value as an index
            if element.tag.endswith('}c'):                               
                if element.attrib.get('t') == 's':
                    value = strings[int(value)]
                # split the row/col information so that the row leter(s) can be separate
                letter = element.attrib['r']
                while letter[-1].isdigit():
                    letter = letter[:-1]
                row[letter] = value
                value = ''
            if element.tag.endswith('}row'):
                rows.append(row)
                row = {}    
    return rows

Adapted for sheets without sharedStrings:

def xlsx(fname):
    import zipfile
    from xml.etree.ElementTree import iterparse
    z = zipfile.ZipFile(fname)
    # Get shared strings
    #strings = [el.text for e, el in iterparse(z.open('xl/sharedStrings.xml'))
    #                   if el.tag.endswith('}t')]
    rows = []
    row = {}
    value = ''
    for e, el in iterparse(z.open('xl/worksheets/sheet7.xml')):
        # get value or index to shared strings
        if el.tag.endswith('}v'):                               # <v>84</v>
            value = el.text
        if el.tag.endswith('}t'):                               # <t>String</t>
            value = el.text
        # If value is a shared string, use value as an index
        if el.tag.endswith('}c'):                               # <c r="A3" t="s"><v>84</v></c>
            if el.attrib.get('t') == 's':
                value = strings[int(value)]
            # split the row/col information so that the row leter(s) can be separate
            letter = el.attrib['r']                             # AZ22
            while letter[-1].isdigit():
                letter = letter[:-1]
            row[letter] = value
            value = ''
        if el.tag.endswith('}row'):
            rows.append(row)
            row = {}
    return rows