Difference between revisions of "Python:Files"

From wiki
Jump to navigation Jump to search
 
(21 intermediate revisions by the same user not shown)
Line 9: Line 9:
 
files = glob.glob(filespec)
 
files = glob.glob(filespec)
 
</syntaxhighlight>
 
</syntaxhighlight>
 +
 +
;os.path.isfile(filename)
 +
:Boolean for file existence
  
 
;fh = open (filename,"r")
 
;fh = open (filename,"r")
Line 25: Line 28:
 
     f1.close()
 
     f1.close()
 
</syntaxhighlight>
 
</syntaxhighlight>
 
+
Or 'Easier to Ask for Forgiveness than Permission' (EAFP):
;basename = filepath.split('/')[-1]
+
<syntaxhighlight lang=python>
:Get the filename from a path
+
try:
 
+
    fh = open (filename,"r")
;statinfo = os.stat(filename)
+
except:
:Get file metadata like:
+
    print('ERROR: {} cannot be opened'.format(filename))
:<code>posix.stat_result(st_mode=33204, st_ino=3069488, st_dev=21L, st_nlink=1, st_uid=999, st_gid=999, st_size=37078, st_atime=4939053720, st_mtime=3939053719, st_ctime=2939053719)</code>
+
    logging.error('ERROR: {} cannot be opened'.format(filename))
:<code>statinfo.st_size</code> has the filesize in bytes.
+
else:
 +
    <other code>
 +
</syntaxhighlight>
  
 
;with open (filename,"r") as file
 
;with open (filename,"r") as file
Line 54: Line 59:
 
:Read through all files specified on the commandline.
 
:Read through all files specified on the commandline.
 
:If there are no files on the commandline read standard input
 
:If there are no files on the commandline read standard input
 +
:You can pass other arguments too but you have to remove them from sys.argv before you start reading fileinput
 
<syntaxhighlight lang=python>
 
<syntaxhighlight lang=python>
 
import fileinput
 
import fileinput
 +
import sys
 +
 +
otherarg = sys.argv.pop()  # other argument is the last on the commandline
  
 
for line in fileinput.input():
 
for line in fileinput.input():
Line 67: Line 76:
 
:Write to standard output
 
:Write to standard output
  
=Zip a file=
+
;basename = filepath.split('/')[-1]
 +
:Get the filename from a path
 +
 
 +
=Filehandling and metadata=
 +
;os.unlink(filename)
 +
:Remove file or symbolic link
 +
 
 +
;statinfo = os.stat(filename)
 +
:Get file metadata like:
 +
:<code>posix.stat_result(st_mode=33204, st_ino=3069488, st_dev=21L, st_nlink=1, st_uid=999, st_gid=999, st_size=37078, st_atime=4939053720, st_mtime=3939053719, st_ctime=2939053719)</code>
 +
:<code>statinfo.st_size</code> has the filesize in bytes.
 +
 
 +
;Walking a directory tree and fetching file information
 +
<syntaxhighlight lang=python>
 +
def do_dir(directory:
 +
    with os.scandir(directory) as it:
 +
        for entry in it:
 +
            if not entry.name.startswith('.'):
 +
                if entry.is_file():
 +
                    filepath = entry.path
 +
                    inode = entry.inode()
 +
                    ctime = entry.stat().st_ctime # see statinfo for other data
 +
                elif entry.is_dir():
 +
                    do_dir(entry)
 +
</syntaxhighlight>
 +
 
 +
=Archives=
 +
 
 +
==Read an archive==
 +
Read a file in a tar archive into a list of lines regardless of the compression used (not zip).
 +
<syntaxhighlight lang=python>
 +
import tarfile
 +
tar = tarfile.open(<tarfile>,'r')
 +
for member in tar.getmembers():
 +
  print(member.name)
 +
  filelist = tar.extractfile(member)
 +
</syntaxhighlight>
 +
 
 +
==Copy files from 1 archive to another==
 +
<syntaxhighlight lang=python>
 +
#!/usr/bin/env python3
 +
import tarfile
 +
 
 +
filenames = {<(part of) filename to copy>, <(part of) filename to copy>}
 +
oldtar = tarfile.open('tar1.tar',"r")
 +
newtar = tarfile.open('tar2.tar',"w")
 +
for member in oldtar.getmembers():
 +
    done = 0
 +
    for filename in filenames:
 +
        if filename in member.name:
 +
            try:
 +
                newtar.addfile(member, oldtar.extractfile(member.name))
 +
                done = 1
 +
            except OSError as exception:
 +
                print(f"{member.name} has error {exception}")
 +
                done = 2
 +
 
 +
    if done == 1:
 +
        print(f"{member.name} Added")
 +
    elif done == 0:
 +
        print(f"{member.name} Skipped")
 +
newtar.close()
 +
oldtar.close()
 +
</syntaxhighlight>
 +
 
 +
=Zip files=
 
Check [https://pymotw.com/2/zipfile/ this page].
 
Check [https://pymotw.com/2/zipfile/ this page].
  
 +
==Read a zip-file==
 +
<syntaxhighlight lang=python>
 +
import zipfile
 +
 +
z = zipfile.ZipFile(zipile)
 +
for file in z.namelist():
 +
    print(file)
 +
 +
data = z.read(<zipped-filename>)   
 +
</syntaxhighlight>
 +
 +
==Create a zip-file==
 
<syntaxhighlight lang=python>
 
<syntaxhighlight lang=python>
 
import zipfile,zlib
 
import zipfile,zlib
Line 77: Line 163:
 
if zfile:
 
if zfile:
 
     zfile.write(filename, compress_type=zipfile.ZIP_DEFLATED)
 
     zfile.write(filename, compress_type=zipfile.ZIP_DEFLATED)
 +
</syntaxhighlight>
 +
 +
 +
=[[XLS|Excel files]]=
 +
==Reading==
 +
Excel files are basically zip-files with some specific content and they can be handled like that. [[Pandas]] has a built-in ability to read excel into a dataframe, if possible use that.
 +
==Writing==
 +
Below writes a list of lists to excel
 +
<syntaxhighlight lang=python>
 +
import xlsxwriter
 +
 +
def main():
 +
    workbook = xlsxwriter.Workbook(excelfilename, {'nan_inf_to_errors': True})
 +
 +
    header = ['Column1', 'Column1']
 +
 +
    writeworkbook(workbook, worksheetname, alist, header)
 +
 +
    workbook.close()
 +
    return
 +
 +
def writeworkbook(workbook, worksheetname, outlist, header):
 +
    worksheet = workbook.add_worksheet(worksheetname)
 +
    columnwidths = {}
 +
 +
    columnno = 0
 +
    for column in header:
 +
        columnwidths[columnno] = len(column)
 +
        columnno += 1
 +
 +
    for row in outlist:
 +
        columnno = 0
 +
        for column in row:
 +
            columnwidths[columnno] = max(columnwidths[columnno], len(str(column)))
 +
            columnno += 1
 +
 +
    for columnno in columnwidths:
 +
        worksheet.set_column(columnno, columnno, columnwidths[columnno] + columnwidths[columnno] * 0.1)
 +
 +
    wsindex = 0
 +
    worksheet.write_row(wsindex, 0, header)
 +
    for row in outlist:
 +
        wsindex += 1
 +
        worksheet.write_row(wsindex, 0, row)
 +
 +
    return
 +
 +
main()
 
</syntaxhighlight>
 
</syntaxhighlight>
  
Line 101: Line 235:
 
=Read a csv=
 
=Read a csv=
 
This code reads all files matching the specification and returns the content as a list of dicts that have the fieldnames as keys. Fieldnames must be on the first line of the file and must be unique.
 
This code reads all files matching the specification and returns the content as a list of dicts that have the fieldnames as keys. Fieldnames must be on the first line of the file and must be unique.
 +
NOTE: This code cannot handle values that contain the separator. The line will be split on all separator occurrences. Use [[Pandas]] or a specific csv-reader module if you need this.
  
 
<syntaxhighlight lang=python>
 
<syntaxhighlight lang=python>
def csv2dict(filespec,seperator=','):
+
def csv2dict(filespec, separator=','):
 +
    '''Convert a csv-file to a list of dicts'''
 
     outfile = []
 
     outfile = []
 
     filedir = glob.glob(filespec)
 
     filedir = glob.glob(filespec)
 
     for filename in filedir:
 
     for filename in filedir:
         if os.path.isfile(filename):
+
         try:
             fh = open (filename,"r")
+
             fh = open(filename, "r")
             filelist = list(fh)
+
        except:
 +
            print('{} cannot be opened'.format(filename))
 +
        else:
 +
             filelist = [line.strip().split(separator) for line in fh]
 
             fh.close()
 
             fh.close()
             line = filelist.pop(0).rstrip('\r\n')
+
             header = filelist.pop(0)
             fieldnames = {}
+
             fieldnames = set(header)
             header = line.split(seperator)
+
             if len(header) != len(fieldnames):
            for field in header:
+
                 print('ERROR: Fieldnames in {} are not unique'.format(filename))
                 print('Fieldname = '+field)
+
             else:
                if field in fieldnames:
+
                numfields = len(header)
                    print('ERROR: Fieldnames are not unique')
+
                linecount = 0
             numfields = len(header)
+
                for line in filelist:
            linecount = 0
+
                    linecount += 1
            for line in filelist:
+
                    linedict = {}
                linecount += 1
+
                    count = 0
                line = line.rstrip('\r\n')
+
                    for field in line:
                fields = line.split(seperator)
+
                        linedict[header[count]] = field
                linedict = {}
+
                        count += 1
                count = 0
+
                        if count > numfields - 1:
                for field in fields:  
+
                            break
                    linedict[header[count]] = field
+
                    if count != numfields:
                    count += 1
+
                         print('ERROR: invalid number of fields in line ' + str(linecount))
                    if count > numfields-1:
+
                    outfile.append(linedict)
                        break
+
     return (outfile)</syntaxhighlight>
                if count != numfields:
+
 
                         print('ERROR: invalid number of fields in line '+str(linecount))                  
+
=Read xml=
                outfile.append(linedict)
+
Module and code examples [[Python:XML]]
     return(outfile)
 
</syntaxhighlight>
 

Latest revision as of 17:29, 12 December 2023


Basics

glob.glob(filespec)
Return a list of files matching 'filespec'.

Code example:

import glob
files = glob.glob(filespec)
os.path.isfile(filename)
Boolean for file existence
fh = open (filename,"r")
open filename for read and return the filehandle fh. Use w for write, a for append.
fh.close()
Close the file for filehandle fh.

Code example:

import os
if os.path.isfile(filename):
    f1 =  open (filename,"r")
    for line in f1:
        <codeblock>
    f1.close()

Or 'Easier to Ask for Forgiveness than Permission' (EAFP):

try:
    fh = open (filename,"r")
except:
    print('ERROR: {} cannot be opened'.format(filename))
    logging.error('ERROR: {} cannot be opened'.format(filename))
else:
    <other code>
with open (filename,"r") as file
Open filename for read and close at the end of the loop

Code example:

with open (filename,"r") as file:
    for line in file:
        <codeblock>
f1.read(size)
Return 'size' bytes from the file as string. If size is omitted or 0 the entire file is returned.
f1.readlines()
list(f1)
Return all lines from file as list.
fileinput.input()
Read through all files specified on the commandline.
If there are no files on the commandline read standard input
You can pass other arguments too but you have to remove them from sys.argv before you start reading fileinput
import fileinput
import sys

otherarg = sys.argv.pop()  # other argument is the last on the commandline

for line in fileinput.input():
    <codeblock>
f1.write(line)
Write line to file opened on filehandle f1
sys.stdout.write(<string>)
Write to standard output
basename = filepath.split('/')[-1]
Get the filename from a path

Filehandling and metadata

os.unlink(filename)
Remove file or symbolic link
statinfo = os.stat(filename)
Get file metadata like:
posix.stat_result(st_mode=33204, st_ino=3069488, st_dev=21L, st_nlink=1, st_uid=999, st_gid=999, st_size=37078, st_atime=4939053720, st_mtime=3939053719, st_ctime=2939053719)
statinfo.st_size has the filesize in bytes.
Walking a directory tree and fetching file information
def do_dir(directory):
    """Recursively walk *directory*, reading metadata of non-hidden entries.

    For each regular file the path, inode and ctime are fetched; for each
    sub-directory the walk recurses.  NOTE(review): the fetched values are
    only bound to locals here — a real caller would accumulate or yield them.

    The original had a SyntaxError: ``def do_dir(directory:`` was missing
    the closing parenthesis.
    """
    # os.scandir as a context manager closes the directory handle promptly
    with os.scandir(directory) as it:
        for entry in it:
            if entry.name.startswith('.'):
                continue  # skip hidden files and directories
            if entry.is_file():
                filepath = entry.path
                inode = entry.inode()
                ctime = entry.stat().st_ctime  # see statinfo for other data
            elif entry.is_dir():
                # DirEntry is os.PathLike, so it can be passed straight back in
                do_dir(entry)

Archives

Read an archive

Read a file in a tar archive into a list of lines regardless of the compression used (not zip).

import tarfile
tar = tarfile.open(<tarfile>,'r')
for member in tar.getmembers():
   print(member.name)
   filelist = tar.extractfile(member)

Copy files from 1 archive to another

#!/usr/bin/env python3
# Copy selected members from one tar archive into another.
import tarfile

# (part of) the filenames to copy; the angle brackets are placeholders to fill in
filenames = {<(part of) filename to copy>, <(part of) filename to copy>}
oldtar = tarfile.open('tar1.tar',"r")
newtar = tarfile.open('tar2.tar',"w")
for member in oldtar.getmembers():
    done = 0  # 0 = skipped, 1 = copied, 2 = error
    for filename in filenames:
        if filename in member.name:
            try:
                # extractfile() returns a file object; addfile() streams it
                # into the new archive together with the member's metadata
                newtar.addfile(member, oldtar.extractfile(member.name))
                done = 1
            except OSError as exception:
                print(f"{member.name} has error {exception}")
                done = 2

    if done == 1:
        print(f"{member.name} Added")
    elif done == 0:
        print(f"{member.name} Skipped")
newtar.close()
oldtar.close()

Zip files

Check this page.

Read a zip-file

import zipfile

z = zipfile.ZipFile(zipile)
for file in z.namelist():
    print(file)

data = z.read(<zipped-filename>)

Create a zip-file

import zipfile,zlib

zipname = filename+'.zip'
zfile = zipfile.ZipFile(zipname, mode='w')
if zfile:
    zfile.write(filename, compress_type=zipfile.ZIP_DEFLATED)


Excel files

Reading

Excel files are basically zip-files with some specific content and they can be handled like that. Pandas has a built-in ability to read excel into a dataframe, if possible use that.

Writing

Below writes a list of lists to excel

import xlsxwriter

def main():
    """Create the workbook, write the data with a header row, then close it.

    Relies on module-level names (excelfilename, worksheetname, alist) and
    the third-party xlsxwriter package.
    """
    workbook = xlsxwriter.Workbook(excelfilename, {'nan_inf_to_errors': True})
    writeworkbook(workbook, worksheetname, alist, ['Column1', 'Column1'])
    workbook.close()

def writeworkbook(workbook, worksheetname, outlist, header):
    """Write *header* and the rows of *outlist* to a new worksheet,
    sizing each column to its widest cell plus 10% padding.

    NOTE(review): assumes no row is longer than *header* — a longer row
    would raise KeyError on the width lookup.
    """
    worksheet = workbook.add_worksheet(worksheetname)

    # Seed the column widths from the header text lengths.
    columnwidths = {idx: len(name) for idx, name in enumerate(header)}

    # Widen each column to the longest stringified cell value it holds.
    for row in outlist:
        for idx, cell in enumerate(row):
            columnwidths[idx] = max(columnwidths[idx], len(str(cell)))

    # Apply the widths with 10% extra breathing room.
    for idx, width in columnwidths.items():
        worksheet.set_column(idx, idx, width + width * 0.1)

    # Header goes on row 0, data rows follow.
    worksheet.write_row(0, 0, header)
    for rowno, row in enumerate(outlist, start=1):
        worksheet.write_row(rowno, 0, row)

main()

Read from standard input and keyboard

Read from standard input

import sys

for line in sys.stdin:
    <codeblock>

Prompt and read from keyboard into a

a = input("Prompt: ")

In python2

a = raw_input("Prompt: ")

Read a csv

This code reads all files matching the specification and returns the content as a list of dicts that have the fieldnames as keys. Fieldnames must be on the first line of the file and must be unique. NOTE: This code cannot handle values that contain the separator. The line will be split on all separator occurrences. Use Pandas or a specific csv-reader module if you need this.

def csv2dict(filespec, separator=','):
    """Convert every csv-file matching *filespec* to a list of dicts.

    The first line of each file must hold unique fieldnames; they become
    the dict keys. Lines are split on every occurrence of *separator*, so
    quoted values containing the separator are not supported.

    Fixes over the previous version: a ``with`` statement replaces the
    manual open/close, the bare ``except:`` is narrowed to ``OSError``,
    empty files no longer raise IndexError on the header pop, and lines
    with too many fields are now reported instead of silently truncated.

    Returns a list of dicts, one per data line across all matched files.
    Unreadable files and files with duplicate fieldnames are reported on
    stdout and skipped.
    """
    outfile = []
    for filename in glob.glob(filespec):
        try:
            # context manager guarantees the handle is closed even on error
            with open(filename, "r") as fh:
                filelist = [line.strip().split(separator) for line in fh]
        except OSError:
            # narrow except: report I/O problems only, without swallowing
            # KeyboardInterrupt/SystemExit like a bare `except:` would
            print('{} cannot be opened'.format(filename))
            continue
        if not filelist:
            continue  # empty file: no header, nothing to do
        header = filelist.pop(0)
        if len(header) != len(set(header)):
            print('ERROR: Fieldnames in {} are not unique'.format(filename))
            continue
        numfields = len(header)
        for linecount, fields in enumerate(filelist, start=1):
            # zip truncates at the shorter side, so extra fields are dropped
            linedict = dict(zip(header, fields))
            if len(fields) != numfields:
                print('ERROR: invalid number of fields in line ' + str(linecount))
            outfile.append(linedict)
    return outfile

Read xml

Module and code examples Python:XML