Difference between revisions of "Python:Files"
(34 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
[[Category:Python]] | [[Category:Python]] | ||
+ | =Basics= | ||
;glob.glob(filespec) | ;glob.glob(filespec) | ||
:Return a [[Python:DataTypes#list|list]] of files matching 'filespec'. | :Return a [[Python:DataTypes#list|list]] of files matching 'filespec'. | ||
Line 9: | Line 10: | ||
</syntaxhighlight> | </syntaxhighlight> | ||
− | ;open (filename,"r") | + | ;os.path.isfile(filename) |
− | :open filename for read and return the filehandle. Use w for write. | + | :Boolean for file existence |
+ | |||
+ | ;fh = open (filename,"r") | ||
+ | :open filename for read and return the filehandle fh. Use w for write, a for append. | ||
+ | |||
+ | ;fh.close() | ||
+ | :Close the file for filehandle fh. | ||
Code example: | Code example: | ||
Line 19: | Line 26: | ||
for line in f1: | for line in f1: | ||
<codeblock> | <codeblock> | ||
− | f1.close | + | f1.close() |
+ | </syntaxhighlight> | ||
+ | Or 'Easier to Ask for Forgiveness than Permission' (EAFP): | ||
+ | <syntaxhighlight lang=python> | ||
+ | try: | ||
+ | fh = open (filename,"r") | ||
+ | except: | ||
+ | print('ERROR: {} cannot be opened'.format(filename)) | ||
+ | logging.error('ERROR: {} cannot be opened'.format(filename)) | ||
+ | else: | ||
+ | <other code> | ||
</syntaxhighlight> | </syntaxhighlight> | ||
Line 42: | Line 59: | ||
:Read through all files specified on the commandline. | :Read through all files specified on the commandline. | ||
:If there are no files on the commandline read standard input | :If there are no files on the commandline read standard input | ||
+ | :You can pass other arguments too but you have to remove them from sys.argv before you start reading fileinput | ||
<syntaxhighlight lang=python> | <syntaxhighlight lang=python> | ||
import fileinput | import fileinput | ||
+ | import sys | ||
+ | |||
+ | otherarg = sys.argv.pop() # other argument is the last on the commandline | ||
for line in fileinput.input(): | for line in fileinput.input(): | ||
Line 50: | Line 71: | ||
;f1.write(line) | ;f1.write(line) | ||
− | :Write line to file | + | :Write line to file opened on filehandle f1 |
+ | |||
+ | ;sys.stdout.write(<string>) | ||
+ | :Write to standard output | ||
+ | |||
+ | ;basename = filepath.split('/')[-1] | ||
+ | :Get the filename from a path | ||
+ | |||
+ | =Filehandling and metadata= | ||
+ | ;os.unlink(filename) | ||
+ | :Remove file or symbolic link | ||
+ | |||
+ | ;statinfo = os.stat(filename) | ||
+ | :Get file metadata like: | ||
+ | :<code>posix.stat_result(st_mode=33204, st_ino=3069488, st_dev=21L, st_nlink=1, st_uid=999, st_gid=999, st_size=37078, st_atime=4939053720, st_mtime=3939053719, st_ctime=2939053719)</code> | ||
+ | :<code>statinfo.st_size</code> has the filesize in bytes. | ||
+ | |||
+ | ;Walking a directory tree and fetching file information | ||
+ | <syntaxhighlight lang=python> | ||
+ | def do_dir(directory: | ||
+ | with os.scandir(directory) as it: | ||
+ | for entry in it: | ||
+ | if not entry.name.startswith('.'): | ||
+ | if entry.is_file(): | ||
+ | filepath = entry.path | ||
+ | inode = entry.inode() | ||
+ | ctime = entry.stat().st_ctime # see statinfo for other data | ||
+ | elif entry.is_dir(): | ||
+ | do_dir(entry) | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | =Archives= | ||
− | ==Read from standard input and keyboard | + | ==Read an archive== |
+ | Read a file in a tar archive into a list of lines, regardless of the compression used (not zip). | ||
+ | <syntaxhighlight lang=python> | ||
+ | import tarfile | ||
+ | tar = tarfile.open(<tarfile>,'r') | ||
+ | for member in tar.getmembers(): | ||
+ | print(member.name) | ||
+ | filelist = tar.extractfile(member) | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ==Copy files from 1 archive to another== | ||
+ | <syntaxhighlight lang=python> | ||
+ | #!/usr/bin/env python3 | ||
+ | import tarfile | ||
+ | |||
+ | filenames = {<(part of) filename to copy>, <(part of) filename to copy>} | ||
+ | oldtar = tarfile.open('tar1.tar',"r") | ||
+ | newtar = tarfile.open('tar2.tar',"w") | ||
+ | for member in oldtar.getmembers(): | ||
+ | done = 0 | ||
+ | for filename in filenames: | ||
+ | if filename in member.name: | ||
+ | try: | ||
+ | newtar.addfile(member, oldtar.extractfile(member.name)) | ||
+ | done = 1 | ||
+ | except OSError as exception: | ||
+ | print(f"{member.name} has error {exception}") | ||
+ | done = 2 | ||
+ | |||
+ | if done == 1: | ||
+ | print(f"{member.name} Added") | ||
+ | elif done == 0: | ||
+ | print(f"{member.name} Skipped") | ||
+ | newtar.close() | ||
+ | oldtar.close() | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | =Zip files= | ||
+ | Check [https://pymotw.com/2/zipfile/ this page]. | ||
+ | |||
+ | ==Read a zip-file== | ||
+ | <syntaxhighlight lang=python> | ||
+ | import zipfile | ||
+ | |||
+ | z = zipfile.ZipFile(zipile) | ||
+ | for file in z.namelist(): | ||
+ | print(file) | ||
+ | |||
+ | data = z.read(<zipped-filename>) | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | ==Create a zip-file== | ||
+ | <syntaxhighlight lang=python> | ||
+ | import zipfile,zlib | ||
+ | |||
+ | zipname = filename+'.zip' | ||
+ | zfile = zipfile.ZipFile(zipname, mode='w') | ||
+ | if zfile: | ||
+ | zfile.write(filename, compress_type=zipfile.ZIP_DEFLATED) | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | |||
+ | =[[XLS|Excel files]]= | ||
+ | ==Reading== | ||
+ | An Excel file is basically a zip-file with some specific content and can be handled as such. [[Pandas]] has a built-in ability to read Excel into a dataframe; use that if possible. | ||
+ | ==Writing== | ||
+ | Below writes a list of lists to excel | ||
+ | <syntaxhighlight lang=python> | ||
+ | import xlsxwriter | ||
+ | |||
+ | def main(): | ||
+ | workbook = xlsxwriter.Workbook(excelfilename, {'nan_inf_to_errors': True}) | ||
+ | |||
+ | header = ['Column1', 'Column1'] | ||
+ | |||
+ | writeworkbook(workbook, worksheetname, alist, header) | ||
+ | |||
+ | workbook.close() | ||
+ | return | ||
+ | |||
+ | def writeworkbook(workbook, worksheetname, outlist, header): | ||
+ | worksheet = workbook.add_worksheet(worksheetname) | ||
+ | columnwidths = {} | ||
+ | |||
+ | columnno = 0 | ||
+ | for column in header: | ||
+ | columnwidths[columnno] = len(column) | ||
+ | columnno += 1 | ||
+ | |||
+ | for row in outlist: | ||
+ | columnno = 0 | ||
+ | for column in row: | ||
+ | columnwidths[columnno] = max(columnwidths[columnno], len(str(column))) | ||
+ | columnno += 1 | ||
+ | |||
+ | for columnno in columnwidths: | ||
+ | worksheet.set_column(columnno, columnno, columnwidths[columnno] + columnwidths[columnno] * 0.1) | ||
+ | |||
+ | wsindex = 0 | ||
+ | worksheet.write_row(wsindex, 0, header) | ||
+ | for row in outlist: | ||
+ | wsindex += 1 | ||
+ | worksheet.write_row(wsindex, 0, row) | ||
+ | |||
+ | return | ||
+ | |||
+ | main() | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | =Read from standard input and keyboard= | ||
Read from standard input | Read from standard input | ||
Line 71: | Line 232: | ||
a = raw_input("Prompt: ") | a = raw_input("Prompt: ") | ||
</syntaxhighlight> | </syntaxhighlight> | ||
+ | |||
+ | =Read a csv= | ||
+ | This code reads all files matching the specification and returns the content as a list of dicts that have the fieldnames as keys. Fieldnames must be on the first line of the file and must be unique. | ||
+ | NOTE: This code cannot handle values that contain the separator. The line will be split on all separator occurrences. Use [[Pandas]] or a specific csv-reader module if you need this. | ||
+ | |||
+ | <syntaxhighlight lang=python> | ||
+ | def csv2dict(filespec, separator=','): | ||
+ | '''Convert a csv-file to a list of dicts''' | ||
+ | outfile = [] | ||
+ | filedir = glob.glob(filespec) | ||
+ | for filename in filedir: | ||
+ | try: | ||
+ | fh = open(filename, "r") | ||
+ | except: | ||
+ | print('{} cannot be opened'.format(filename)) | ||
+ | else: | ||
+ | filelist = [line.strip().split(separator) for line in fh] | ||
+ | fh.close() | ||
+ | header = filelist.pop(0) | ||
+ | fieldnames = set(header) | ||
+ | if len(header) != len(fieldnames): | ||
+ | print('ERROR: Fieldnames in {} are not unique'.format(filename)) | ||
+ | else: | ||
+ | numfields = len(header) | ||
+ | linecount = 0 | ||
+ | for line in filelist: | ||
+ | linecount += 1 | ||
+ | linedict = {} | ||
+ | count = 0 | ||
+ | for field in line: | ||
+ | linedict[header[count]] = field | ||
+ | count += 1 | ||
+ | if count > numfields - 1: | ||
+ | break | ||
+ | if count != numfields: | ||
+ | print('ERROR: invalid number of fields in line ' + str(linecount)) | ||
+ | outfile.append(linedict) | ||
+ | return (outfile)</syntaxhighlight> | ||
+ | |||
+ | =Read xml= | ||
+ | Module and code examples [[Python:XML]] |
Latest revision as of 17:29, 12 December 2023
Basics
- glob.glob(filespec)
- Return a list of files matching 'filespec'.
Code example:
import glob
files = glob.glob(filespec)
- os.path.isfile(filename)
- Boolean for file existence
- fh = open (filename,"r")
- open filename for read and return the filehandle fh. Use w for write, a for append.
- fh.close()
- Close the file for filehandle fh.
Code example:
import os
if os.path.isfile(filename):
f1 = open (filename,"r")
for line in f1:
<codeblock>
f1.close()
Or 'Easier to Ask for Forgiveness than Permission' (EAFP):
try:
fh = open (filename,"r")
except:
print('ERROR: {} cannot be opened'.format(filename))
logging.error('ERROR: {} cannot be opened'.format(filename))
else:
<other code>
- with open (filename,"r") as file
- Open filename for read and close at the end of the loop
Code example:
with open (filename,"r") as file:
for line in file:
<codeblock>
- f1.read(size)
- Return 'size' bytes from the file as string. If size is omitted or 0 the entire file is returned.
- f1.readlines()
- list(f1)
- Return all lines from file as list.
- fileinput.input()
- Read through all files specified on the commandline.
- If there are no files on the commandline read standard input
- You can pass other arguments too but you have to remove them from sys.argv before you start reading fileinput
import fileinput
import sys
otherarg = sys.argv.pop() # other argument is the last on the commandline
for line in fileinput.input():
<codeblock>
- f1.write(line)
- Write line to file opened on filehandle f1
- sys.stdout.write(<string>)
- Write to standard output
- basename = filepath.split('/')[-1]
- Get the filename from a path
Filehandling and metadata
- os.unlink(filename)
- Remove file or symbolic link
- statinfo = os.stat(filename)
- Get file metadata like:
posix.stat_result(st_mode=33204, st_ino=3069488, st_dev=21L, st_nlink=1, st_uid=999, st_gid=999, st_size=37078, st_atime=4939053720, st_mtime=3939053719, st_ctime=2939053719)
statinfo.st_size
has the filesize in bytes.
- Walking a directory tree and fetching file information
import os


def do_dir(directory):
    """Walk *directory* recursively and fetch information on every file.

    Entries whose name starts with a dot are skipped.  For each file the
    path, the inode number and the ctime are looked up; each sub-directory
    is walked by a recursive call.  The values are only bound to local
    names here, as a starting point for your own processing.

    directory: path (or path-like object) of the directory to walk.
    Returns None.
    """
    with os.scandir(directory) as it:
        for entry in it:
            if entry.name.startswith('.'):
                continue
            if entry.is_file():
                filepath = entry.path
                inode = entry.inode()
                ctime = entry.stat().st_ctime  # see statinfo for other data
            elif entry.is_dir():
                # NOTE: is_dir() follows symbolic links by default
                do_dir(entry.path)
Archives
Read an archive
Read a file in a tar archive into a list of lines, regardless of the compression used (not zip).
import tarfile
tar = tarfile.open(<tarfile>,'r')
for member in tar.getmembers():
print(member.name)
filelist = tar.extractfile(member)
Copy files from 1 archive to another
#!/usr/bin/env python3
import tarfile
filenames = {<(part of) filename to copy>, <(part of) filename to copy>}
oldtar = tarfile.open('tar1.tar',"r")
newtar = tarfile.open('tar2.tar',"w")
for member in oldtar.getmembers():
done = 0
for filename in filenames:
if filename in member.name:
try:
newtar.addfile(member, oldtar.extractfile(member.name))
done = 1
except OSError as exception:
print(f"{member.name} has error {exception}")
done = 2
if done == 1:
print(f"{member.name} Added")
elif done == 0:
print(f"{member.name} Skipped")
newtar.close()
oldtar.close()
Zip files
Check this page.
Read a zip-file
import zipfile
z = zipfile.ZipFile(zipile)
for file in z.namelist():
print(file)
data = z.read(<zipped-filename>)
Create a zip-file
import zipfile
import zlib  # ZIP_DEFLATED compression requires the zlib module

zipname = filename + '.zip'
# The archive must be closed to get its essential records written;
# the with-statement guarantees that, also when write() fails.
with zipfile.ZipFile(zipname, mode='w') as zfile:
    zfile.write(filename, compress_type=zipfile.ZIP_DEFLATED)
Excel files
Reading
An Excel file is basically a zip-file with some specific content and can be handled as such. Pandas has a built-in ability to read Excel into a dataframe; use that if possible.
Writing
Below writes a list of lists to excel
import xlsxwriter
def main():
    """Write the list of lists alist to one worksheet of a new Excel file.

    excelfilename, worksheetname and alist are not defined in this example;
    they have to be provided by the surrounding script.
    """
    # nan_inf_to_errors: NaN and Inf values are written as Excel errors
    workbook = xlsxwriter.Workbook(excelfilename, {'nan_inf_to_errors': True})

    # One label per column of alist
    header = ['Column1', 'Column2']

    writeworkbook(workbook, worksheetname, alist, header)

    # close() is what actually writes the xlsx-file
    workbook.close()
def writeworkbook(workbook, worksheetname, outlist, header):
    """Add a worksheet with header as first row, followed by the rows of outlist.

    Each column is made 10% wider than its longest value, measured on the
    string form of the header and of every cell.

    workbook:      workbook object providing add_worksheet()
    worksheetname: name of the worksheet to add
    outlist:       list of rows, each row a list of cell values
    header:        list of column labels
    Returns None.
    """
    worksheet = workbook.add_worksheet(worksheetname)

    # Longest value per column.  get() copes with rows that have more
    # columns than the header; str() copes with non-string cells.
    columnwidths = {}
    for row in (header, *outlist):
        for columnno, column in enumerate(row):
            columnwidths[columnno] = max(columnwidths.get(columnno, 0), len(str(column)))

    for columnno, width in columnwidths.items():
        worksheet.set_column(columnno, columnno, width + width * 0.1)

    # Header on row 0, data from row 1 onwards
    worksheet.write_row(0, 0, header)
    for wsindex, row in enumerate(outlist, start=1):
        worksheet.write_row(wsindex, 0, row)
main()
Read from standard input and keyboard
Read from standard input
import sys
for line in sys.stdin:
<codeblock>
Prompt and read from keyboard into a
a = input("Prompt: ")
In python2
a = raw_input("Prompt: ")
Read a csv
This code reads all files matching the specification and returns the content as a list of dicts that have the fieldnames as keys. Fieldnames must be on the first line of the file and must be unique. NOTE: This code cannot handle values that contain the separator. The line will be split on all separator occurrences. Use Pandas or a specific csv-reader module if you need this.
import glob


def csv2dict(filespec, separator=','):
    '''Convert csv-file(s) to a list of dicts.

    All files matching filespec are read.  The first line of each file must
    hold the unique fieldnames; every following line becomes a dict with the
    fieldnames as keys and the (string) fields as values.

    filespec:  glob pattern of the files to read
    separator: field separator; every occurrence splits the line, quoting
               is not supported
    Returns the dicts of all files together in one list.

    Problems are reported with print() and do not raise:
    - a file that cannot be opened, is empty or has duplicate fieldnames
      is skipped;
    - a line with the wrong number of fields is reported and still added:
      surplus fields are dropped, missing fields are left out of the dict.
    '''
    outfile = []
    for filename in glob.glob(filespec):
        try:
            fh = open(filename, "r")
        except OSError:
            print('{} cannot be opened'.format(filename))
            continue
        with fh:
            filelist = [line.strip().split(separator) for line in fh]

        if not filelist:
            # no header line at all
            print('ERROR: {} is empty'.format(filename))
            continue

        header = filelist.pop(0)
        numfields = len(header)
        if numfields != len(set(header)):
            print('ERROR: Fieldnames in {} are not unique'.format(filename))
            continue

        for linecount, line in enumerate(filelist, start=1):
            if len(line) != numfields:
                print('ERROR: invalid number of fields in line ' + str(linecount))
            # zip() stops at the shortest of header and line
            outfile.append(dict(zip(header, line)))
    return outfile
Read xml
Module and code examples Python:XML