Difference between revisions of "Python:Files"

From wiki
Jump to navigation Jump to search
 
(43 intermediate revisions by the same user not shown)
Line 1: Line 1:
 
[[Category:Python]]
 
[[Category:Python]]
  
 +
=Basics=
 
;glob.glob(filespec)
 
;glob.glob(filespec)
:Return a list of files matching 'filespec'.
+
:Return a [[Python:DataTypes#list|list]] of files matching 'filespec'.
 
Code example:
 
Code example:
 
<syntaxhighlight lang=python>
 
<syntaxhighlight lang=python>
Line 9: Line 10:
 
</syntaxhighlight>
 
</syntaxhighlight>
  
;open (filename,"r")
+
;os.path.isfile(filename)
:open filname for read and return the filehandle. Use w for write.
+
:Boolean for file existence
 +
 
 +
;fh = open (filename,"r")
 +
:open filename for read and return the filehandle fh. Use w for write, a for append.
 +
 
 +
;fh.close()
 +
:Close the file for filehandle fh.
  
 
Code example:
 
Code example:
Line 17: Line 24:
 
if os.path.isfile(filename):
 
if os.path.isfile(filename):
 
     f1 =  open (filename,"r")
 
     f1 =  open (filename,"r")
 +
    for line in f1:
 +
        <codeblock>
 +
    f1.close()
 +
</syntaxhighlight>
 +
Or 'Easier to Ask for Forgiveness than Permission' (EAFP):
 +
<syntaxhighlight lang=python>
 +
try:
 +
    fh = open (filename,"r")
 +
except:
 +
    print('ERROR: {} cannot be opened'.format(filename))
 +
    logging.error('ERROR: {} cannot be opened'.format(filename))
 +
else:
 +
    <other code>
 
</syntaxhighlight>
 
</syntaxhighlight>
  
Line 30: Line 50:
  
 
;f1.read(size)
 
;f1.read(size)
:Return 'size' bytest from the file as string if size is omited the entire file is returned.
+
:Return 'size' bytes from the file as [[Python:Strings|string]]. If size is omitted or 0 the entire file is returned.
  
;f1.readlines(size)
+
;f1.readlines()
 
;list(f1)
 
;list(f1)
:Return ('size') lines from file as list.
+
:Return all lines from file as [[Python:DataTypes#list|list]].
  
Read through all files specified on the commandline.
+
;fileinput.input()
If there are no files on the commandline read standard input
+
:Read through all files specified on the commandline.
 +
:If there are no files on the commandline read standard input
 +
:You can pass other arguments too but you have to remove them from sys.argv before you start reading fileinput
 
<syntaxhighlight lang=python>
 
<syntaxhighlight lang=python>
 
import fileinput
 
import fileinput
 +
import sys
 +
 +
otherarg = sys.argv.pop()  # other argument is the last on the commandline
  
 
for line in fileinput.input():
 
for line in fileinput.input():
Line 45: Line 70:
 
</syntaxhighlight>
 
</syntaxhighlight>
  
==Read from standard input==
+
;f1.write(line)
 +
:Write line to file opened on filehandle f1
 +
 
 +
;sys.stdout.write(<string>)
 +
:Write to standard output
 +
 
 +
;basename = filepath.split('/')[-1]
 +
:Get the filename from a path
 +
 
 +
=Filehandling and metadata=
 +
;os.unlink(filename)
 +
:Remove file or symbolic link
 +
 
 +
;statinfo = os.stat(filename)
 +
:Get file metadata like:
 +
:<code>posix.stat_result(st_mode=33204, st_ino=3069488, st_dev=21L, st_nlink=1, st_uid=999, st_gid=999, st_size=37078, st_atime=4939053720, st_mtime=3939053719, st_ctime=2939053719)</code>
 +
:<code>statinfo.st_size</code> has the filesize in bytes.
 +
 
 +
;Walking a directory tree and fetching file information
 +
<syntaxhighlight lang=python>
 +
def do_dir(directory:
 +
    with os.scandir(directory) as it:
 +
        for entry in it:
 +
            if not entry.name.startswith('.'):
 +
                if entry.is_file():
 +
                    filepath = entry.path
 +
                    inode = entry.inode()
 +
                    ctime = entry.stat().st_ctime # see statinfo for other data
 +
                elif entry.is_dir():
 +
                    do_dir(entry)
 +
</syntaxhighlight>
 +
 
 +
=Archives=
 +
 
 +
==Read an archive==
 +
Read a file in a tar archive into a list of lines regardless of the compression used (not zip).
 +
<syntaxhighlight lang=python>
 +
import tarfile
 +
tar = tarfile.open(<tarfile>,'r')
 +
for member in tar.getmembers():
 +
  print(member.name)
 +
  filelist = tar.extractfile(member)
 +
</syntaxhighlight>
 +
 
 +
==Copy files from 1 archive to another==
 +
<syntaxhighlight lang=python>
 +
#!/usr/bin/env python3
 +
import tarfile
 +
 
 +
filenames = {<(part of) filename to copy>, <(part of) filename to copy>}
 +
oldtar = tarfile.open('tar1.tar',"r")
 +
newtar = tarfile.open('tar2.tar',"w")
 +
for member in oldtar.getmembers():
 +
    done = 0
 +
    for filename in filenames:
 +
        if filename in member.name:
 +
            try:
 +
                newtar.addfile(member, oldtar.extractfile(member.name))
 +
                done = 1
 +
            except OSError as exception:
 +
                print(f"{member.name} has error {exception}")
 +
                done = 2
 +
 
 +
    if done == 1:
 +
        print(f"{member.name} Added")
 +
    elif done == 0:
 +
        print(f"{member.name} Skipped")
 +
newtar.close()
 +
oldtar.close()
 +
</syntaxhighlight>
 +
 
 +
=Zip files=
 +
Check [https://pymotw.com/2/zipfile/ this page].
 +
 
 +
==Read a zip-file==
 +
<syntaxhighlight lang=python>
 +
import zipfile
 +
 
 +
z = zipfile.ZipFile(zipile)
 +
for file in z.namelist():
 +
    print(file)
 +
 
 +
data = z.read(<zipped-filename>)   
 +
</syntaxhighlight>
 +
 
 +
==Create a zip-file==
 +
<syntaxhighlight lang=python>
 +
import zipfile,zlib
 +
 
 +
zipname = filename+'.zip'
 +
zfile = zipfile.ZipFile(zipname, mode='w')
 +
if zfile:
 +
    zfile.write(filename, compress_type=zipfile.ZIP_DEFLATED)
 +
</syntaxhighlight>
 +
 
 +
 
 +
=[[XLS|Excel files]]=
 +
==Reading==
 +
Excel files are basically zip-files with some specific content and can be handled like that. [[Pandas]] has a built-in ability to read Excel into a dataframe; if possible use that.
 +
==Writing==
 +
Below writes a list of lists to excel
 +
<syntaxhighlight lang=python>
 +
import xlsxwriter
 +
 
 +
def main():
 +
    workbook = xlsxwriter.Workbook(excelfilename, {'nan_inf_to_errors': True})
  
 +
    header = ['Column1', 'Column1']
 +
 +
    writeworkbook(workbook, worksheetname, alist, header)
 +
 +
    workbook.close()
 +
    return
 +
 +
def writeworkbook(workbook, worksheetname, outlist, header):
 +
    worksheet = workbook.add_worksheet(worksheetname)
 +
    columnwidths = {}
 +
 +
    columnno = 0
 +
    for column in header:
 +
        columnwidths[columnno] = len(column)
 +
        columnno += 1
 +
 +
    for row in outlist:
 +
        columnno = 0
 +
        for column in row:
 +
            columnwidths[columnno] = max(columnwidths[columnno], len(str(column)))
 +
            columnno += 1
 +
 +
    for columnno in columnwidths:
 +
        worksheet.set_column(columnno, columnno, columnwidths[columnno] + columnwidths[columnno] * 0.1)
 +
 +
    wsindex = 0
 +
    worksheet.write_row(wsindex, 0, header)
 +
    for row in outlist:
 +
        wsindex += 1
 +
        worksheet.write_row(wsindex, 0, row)
 +
 +
    return
 +
 +
main()
 +
</syntaxhighlight>
 +
 +
=Read from standard input and keyboard=
  
 
Read from standard input
 
Read from standard input
Line 55: Line 222:
 
     <codeblock>
 
     <codeblock>
 
</syntaxhighlight>
 
</syntaxhighlight>
 +
 +
Prompt and read from the keyboard into a variable
 +
<syntaxhighlight lang=python>
 +
a = input("Prompt: ")
 +
</syntaxhighlight>
 +
 +
In python2
 +
<syntaxhighlight lang=python>
 +
a = raw_input("Prompt: ")
 +
</syntaxhighlight>
 +
 +
=Read a csv=
 +
This code reads all files matching the specification and returns the content as a list of dicts that have the fieldnames as keys. Fieldnames must be on the first line of the file and must be unique.
 +
NOTE: This code cannot handle value's that contain the separator. The line will be split on all separator occurrences. Use [[Pandas]] or a specific csv-reader module if you need this.
 +
 +
<syntaxhighlight lang=python>
 +
def csv2dict(filespec, separator=','):
 +
    '''Convert a csv-file to a list of dicts'''
 +
    outfile = []
 +
    filedir = glob.glob(filespec)
 +
    for filename in filedir:
 +
        try:
 +
            fh = open(filename, "r")
 +
        except:
 +
            print('{} cannot be opened'.format(filename))
 +
        else:
 +
            filelist = [line.strip().split(separator) for line in fh]
 +
            fh.close()
 +
            header = filelist.pop(0)
 +
            fieldnames = set(header)
 +
            if len(header) != len(fieldnames):
 +
                print('ERROR: Fieldnames in {} are not unique'.format(filename))
 +
            else:
 +
                numfields = len(header)
 +
                linecount = 0
 +
                for line in filelist:
 +
                    linecount += 1
 +
                    linedict = {}
 +
                    count = 0
 +
                    for field in line:
 +
                        linedict[header[count]] = field
 +
                        count += 1
 +
                        if count > numfields - 1:
 +
                            break
 +
                    if count != numfields:
 +
                        print('ERROR: invalid number of fields in line ' + str(linecount))
 +
                    outfile.append(linedict) 
 +
    return (outfile)</syntaxhighlight>
 +
 +
=Read xml=
 +
Module and code examples [[Python:XML]]

Latest revision as of 17:29, 12 December 2023


Basics

glob.glob(filespec)
Return a list of files matching 'filespec'.

Code example:

import glob
files = glob.glob(filespec)
os.path.isfile(filename)
Boolean for file existence
fh = open (filename,"r")
open filename for read and return the filehandle fh. Use w for write, a for append.
fh.close()
Close the file for filehandle fh.

Code example:

import os
if os.path.isfile(filename):
    f1 =  open (filename,"r")
    for line in f1:
        <codeblock>
    f1.close()

Or 'Easier to Ask for Forgiveness than Permission' (EAFP):

try:
    fh = open (filename,"r")
except:
    print('ERROR: {} cannot be opened'.format(filename))
    logging.error('ERROR: {} cannot be opened'.format(filename))
else:
    <other code>
with open (filename,"r") as file
Open filename for read and close at the end of the loop

Code example:

with open (filename,"r") as file:
    for line in file:
        <codeblock>
f1.read(size)
Return 'size' bytes from the file as string. If size is omitted or 0 the entire file is returned.
f1.readlines()
list(f1)
Return all lines from file as list.
fileinput.input()
Read through all files specified on the commandline.
If there are no files on the commandline read standard input
You can pass other arguments too but you have to remove them from sys.argv before you start reading fileinput
import fileinput
import sys

otherarg = sys.argv.pop()  # other argument is the last on the commandline

for line in fileinput.input():
    <codeblock>
f1.write(line)
Write line to file opened on filehandle f1
sys.stdout.write(<string>)
Write to standard output
basename = filepath.split('/')[-1]
Get the filename from a path

Filehandling and metadata

os.unlink(filename)
Remove file or symbolic link
statinfo = os.stat(filename)
Get file metadata like:
posix.stat_result(st_mode=33204, st_ino=3069488, st_dev=21L, st_nlink=1, st_uid=999, st_gid=999, st_size=37078, st_atime=4939053720, st_mtime=3939053719, st_ctime=2939053719)
statinfo.st_size has the filesize in bytes.
Walking a directory tree and fetching file information
def do_dir(directory):
    """Recursively walk *directory*, skipping dot-entries, and collect file info.

    Returns a list of (path, inode, ctime) tuples for every regular file found.
    (Fixed: the original was missing the closing parenthesis in the def line,
    and it computed these values only to discard them; returning them makes the
    walk useful while remaining backward-compatible for callers that ignore
    the result.)
    """
    results = []
    with os.scandir(directory) as it:
        for entry in it:
            if entry.name.startswith('.'):
                continue  # skip hidden files and directories
            if entry.is_file():
                # entry.stat() carries the same fields as os.stat() (st_ctime etc.)
                results.append((entry.path, entry.inode(), entry.stat().st_ctime))
            elif entry.is_dir():
                # recurse using the entry's path (the original passed the
                # DirEntry itself, which os.scandir also accepts on 3.6+)
                results.extend(do_dir(entry.path))
    return results

Archives

Read an archive

Read a file in a tar archive into a list of lines regardless of the compression used (not zip).

import tarfile
tar = tarfile.open(<tarfile>,'r')
for member in tar.getmembers():
   print(member.name)
   filelist = tar.extractfile(member)

Copy files from 1 archive to another

#!/usr/bin/env python3
import tarfile

filenames = {<(part of) filename to copy>, <(part of) filename to copy>}
oldtar = tarfile.open('tar1.tar',"r")
newtar = tarfile.open('tar2.tar',"w")
for member in oldtar.getmembers():
    done = 0
    for filename in filenames:
        if filename in member.name:
            try:
                newtar.addfile(member, oldtar.extractfile(member.name))
                done = 1
            except OSError as exception:
                print(f"{member.name} has error {exception}")
                done = 2

    if done == 1:
        print(f"{member.name} Added")
    elif done == 0:
        print(f"{member.name} Skipped")
newtar.close()
oldtar.close()

Zip files

Check this page.

Read a zip-file

import zipfile

z = zipfile.ZipFile(zipile)
for file in z.namelist():
    print(file)

data = z.read(<zipped-filename>)

Create a zip-file

import zipfile,zlib

zipname = filename+'.zip'
zfile = zipfile.ZipFile(zipname, mode='w')
if zfile:
    zfile.write(filename, compress_type=zipfile.ZIP_DEFLATED)


Excel files

Reading

Excel files are basically zip-files with some specific content and can be handled like that. Pandas has a built-in ability to read Excel into a dataframe; if possible use that.

Writing

Below writes a list of lists to excel

import xlsxwriter

def main():
    """Create an xlsx workbook, write one worksheet via writeworkbook(), close it.

    NOTE(review): excelfilename, worksheetname and alist are assumed to be
    defined elsewhere in the surrounding example — confirm before running.
    """
    # nan_inf_to_errors makes xlsxwriter write Excel errors instead of raising
    # on NaN/Inf cell values.
    workbook = xlsxwriter.Workbook(excelfilename, {'nan_inf_to_errors': True})

    # Fixed: the second header cell read 'Column1' twice (copy-paste typo).
    header = ['Column1', 'Column2']

    writeworkbook(workbook, worksheetname, alist, header)

    workbook.close()
    return

def writeworkbook(workbook, worksheetname, outlist, header):
    """Write *header* plus the rows of *outlist* to a new worksheet.

    Creates worksheet *worksheetname* in *workbook*, sizes each column to its
    widest value (with ~10% padding) and writes the header on row 0 followed
    by one row per entry of *outlist*.
    Raises KeyError if a data row has more fields than the header
    (same as the original counter-based version).
    """
    worksheet = workbook.add_worksheet(worksheetname)

    # Seed the column widths from the header text lengths.
    columnwidths = {columnno: len(column)
                    for columnno, column in enumerate(header)}

    # Widen each column to fit its longest cell value.
    for row in outlist:
        for columnno, column in enumerate(row):
            columnwidths[columnno] = max(columnwidths[columnno], len(str(column)))

    # Apply the widths with 10% padding; kept as w + w * 0.1 (not w * 1.1)
    # to preserve the exact float values the original produced.
    for columnno, width in columnwidths.items():
        worksheet.set_column(columnno, columnno, width + width * 0.1)

    # Header on row 0, data rows from row 1 on.
    worksheet.write_row(0, 0, header)
    for wsindex, row in enumerate(outlist, start=1):
        worksheet.write_row(wsindex, 0, row)

    return

main()

Read from standard input and keyboard

Read from standard input

import sys

for line in sys.stdin:
    <codeblock>

Prompt and read from the keyboard into a variable

a = input("Prompt: ")

In python2

a = raw_input("Prompt: ")

Read a csv

This code reads all files matching the specification and returns the content as a list of dicts that have the fieldnames as keys. Fieldnames must be on the first line of the file and must be unique. NOTE: This code cannot handle values that contain the separator. The line will be split on all separator occurrences. Use Pandas or a specific csv-reader module if you need this.

def csv2dict(filespec, separator=','):
    '''Convert csv-files matching *filespec* to a list of dicts.

    The first line of each file supplies the (unique) fieldnames used as
    dict keys. Lines with fewer fields than the header are reported and
    still appended with the fields they do have; extra fields beyond the
    header are silently dropped (same as the original counter loop).
    NOTE: values containing the separator are not supported.
    '''
    outfile = []
    for filename in glob.glob(filespec):
        try:
            # Context manager closes the file even if reading fails part-way.
            with open(filename, "r") as fh:
                filelist = [line.strip().split(separator) for line in fh]
        except OSError:
            # Narrowed from a bare except: only I/O failures are expected here.
            print('{} cannot be opened'.format(filename))
            continue
        if not filelist:
            # Empty file: nothing to convert (the original crashed on pop()).
            continue
        header = filelist.pop(0)
        if len(header) != len(set(header)):
            print('ERROR: Fieldnames in {} are not unique'.format(filename))
            continue
        numfields = len(header)
        for linecount, line in enumerate(filelist, start=1):
            if len(line) < numfields:
                print('ERROR: invalid number of fields in line ' + str(linecount))
            # zip truncates at the shorter sequence, matching the original's
            # assign-and-break-out behaviour for both short and long lines.
            outfile.append(dict(zip(header, line)))
    return outfile

Read xml

Module and code examples Python:XML