Difference between revisions of "XLS"

From wiki
Jump to navigation Jump to search
m
Line 6: Line 6:
  
 
This routine returns a list of dicts like { <columname> : <cellvalue> [, .. ] }
 
This routine returns a list of dicts like { <columname> : <cellvalue> [, .. ] }
 +
The original, with sharedStrings:
 
<syntaxhighlight lang=python>
 
<syntaxhighlight lang=python>
 
def xlsx(fname):
 
def xlsx(fname):
Line 21: Line 22:
 
         # get value or index to shared strings
 
         # get value or index to shared strings
 
         if el.tag.endswith('}v'):                              # <v>84</v>
 
         if el.tag.endswith('}v'):                              # <v>84</v>
 +
            value = el.text
 +
        # If value is a shared string, use value as an index
 +
        if el.tag.endswith('}c'):                              # <c r="A3" t="s"><v>84</v></c>
 +
            if el.attrib.get('t') == 's':
 +
                value = strings[int(value)]
 +
            # split the row/col information so that the row leter(s) can be separate
 +
            letter = el.attrib['r']                            # AZ22
 +
            while letter[-1].isdigit():
 +
                letter = letter[:-1]
 +
            row[letter] = value
 +
            value = ''
 +
        if el.tag.endswith('}row'):
 +
            rows.append(row)
 +
            row = {}
 +
    return rows
 +
</syntaxhighlight>
 +
 +
Adapted for sheets without sharedStrings:
 +
<syntaxhighlight lang=python>
 +
def xlsx(fname):
 +
    import zipfile
 +
    from xml.etree.ElementTree import iterparse
 +
    z = zipfile.ZipFile(fname)
 +
    # Get shared strings
 +
    #strings = [el.text for e, el in iterparse(z.open('xl/sharedStrings.xml'))
 +
    #                  if el.tag.endswith('}t')]
 +
    rows = []
 +
    row = {}
 +
    value = ''
 +
    for e, el in iterparse(z.open('xl/worksheets/sheet7.xml')):
 +
        # get value or index to shared strings
 +
        if el.tag.endswith('}v'):                              # <v>84</v>
 +
            value = el.text
 +
        if el.tag.endswith('}t'):                              # <t>String</t>
 
             value = el.text
 
             value = el.text
 
         # If value is a shared string, use value as an index
 
         # If value is a shared string, use value as an index

Revision as of 10:24, 10 August 2018


Python XLS parser using standard modules.

Found on stackoverflow. It is announced as very very basic but it works very well.

This routine returns a list of dicts like { <columname> : <cellvalue> [, .. ] } The original, with sharedStrings:

def xlsx(fname):
    import zipfile
    from xml.etree.ElementTree import iterparse
    z = zipfile.ZipFile(fname)
    # Get shared strings
    strings = [el.text for e, el
                       in iterparse(z.open('xl/sharedStrings.xml')) 
                       if el.tag.endswith('}t')]
    rows = []
    row = {}
    value = ''
    for e, el in iterparse(z.open('xl/worksheets/sheet1.xml')):
        # get value or index to shared strings
        if el.tag.endswith('}v'):                               # <v>84</v>
            value = el.text
        # If value is a shared string, use value as an index
        if el.tag.endswith('}c'):                               # <c r="A3" t="s"><v>84</v></c>
            if el.attrib.get('t') == 's':
                value = strings[int(value)]
            # split the row/col information so that the row leter(s) can be separate
            letter = el.attrib['r']                             # AZ22
            while letter[-1].isdigit():
                letter = letter[:-1]
            row[letter] = value
            value = ''
        if el.tag.endswith('}row'):
            rows.append(row)
            row = {}
    return rows

Adapted for sheets without sharedStrings:

def xlsx(fname):
    import zipfile
    from xml.etree.ElementTree import iterparse
    z = zipfile.ZipFile(fname)
    # Get shared strings
    #strings = [el.text for e, el in iterparse(z.open('xl/sharedStrings.xml'))
    #                   if el.tag.endswith('}t')]
    rows = []
    row = {}
    value = ''
    for e, el in iterparse(z.open('xl/worksheets/sheet7.xml')):
        # get value or index to shared strings
        if el.tag.endswith('}v'):                               # <v>84</v>
            value = el.text
        if el.tag.endswith('}t'):                               # <t>String</t>
            value = el.text
        # If value is a shared string, use value as an index
        if el.tag.endswith('}c'):                               # <c r="A3" t="s"><v>84</v></c>
            if el.attrib.get('t') == 's':
                value = strings[int(value)]
            # split the row/col information so that the row leter(s) can be separate
            letter = el.attrib['r']                             # AZ22
            while letter[-1].isdigit():
                letter = letter[:-1]
            row[letter] = value
            value = ''
        if el.tag.endswith('}row'):
            rows.append(row)
            row = {}
    return rows