#!/usr/bin/python
#
# This simple script will walk a directory tree, getting info
# on every file encountered and making an entry into a SQLite
# database. The database can then be probed to find information
# on the files in the tree.
#
# To use it, just pass it the top-level directory where you want
# it to start the walk.
#
# e.g.
#
#    map_disk.py /work/halld
#
#
# A companion script, "map_disk_report.py" will print a brief
# summary of the usage. The script "map_disk.php" can be used
# to generate a simple webpage using the resulting map_disk.db
# file.
#
# NOTE: This will get the size of the contents of the files,
# but NOT the disk usage. The usage will depend on the block size
# so if there are lots of small files, the numbers reported
# here may significantly underestimate the disk usage!
#
# NOTE: The timestamps written to the Info table are UTC (they
# carry a literal "+0000" suffix).
#

import sqlite3 as lite
import sys
import os
import getpass
import socket
from os.path import join, getsize, getatime, getmtime, getctime
from os import stat
from pwd import getpwuid
from time import gmtime, localtime, strftime

# Output database; always (re)created in the current working directory.
DB_FILENAME = 'map_disk.db'

# Format of the timestamps stored in the Info table. The offset is a
# literal "+0000", so the time fed to it must be UTC.
TIME_FORMAT = "%a, %d %b %Y %H:%M:%S +0000"

# Column layout shared by the Files and Dirs tables.
ENTRY_COLUMNS = ("fullpath TEXT, name TEXT, root TEXT, owner TEXT, "
                 "size INT, atime INT, mtime INT, ctime INT")


def timestamp():
    """Return the current UTC time formatted with TIME_FORMAT."""
    return strftime(TIME_FORMAT, gmtime())


def create_tables(cur):
    """Create the Files, Dirs, Info and Users tables on cursor *cur*."""
    cur.execute("CREATE TABLE Files(%s)" % ENTRY_COLUMNS)
    cur.execute("CREATE TABLE Dirs(%s)" % ENTRY_COLUMNS)
    cur.execute("CREATE TABLE Info(key TEXT, val TEXT)")
    cur.execute("CREATE TABLE Users(owner TEXT, tot_size INT)")


def record_info(cur, key, val):
    """Insert one (key, val) row into the Info table."""
    cur.execute("INSERT INTO Info VALUES(?, ?)", (key, val))


def add_entry(cur, table, fullpath, name, root, size=None):
    """Stat *fullpath* and insert a row describing it into *table*.

    cur      -- SQLite cursor to insert with
    table    -- "Files" or "Dirs"; only ever a constant supplied by this
                script, which is why it may be spliced into the SQL text
    fullpath -- path of the entry to stat
    name     -- value for the "name" column
    root     -- value for the "root" column (the containing directory)
    size     -- value for the "size" column; None means use the st_size
                reported by stat

    Returns the size that was recorded.

    Raises OSError if the path cannot be stat'ed and KeyError if the
    owning uid has no password-database entry; callers decide whether
    that is fatal.
    """
    # A single stat call supplies the owner, size and all three times.
    info = stat(fullpath)
    owner = getpwuid(info.st_uid).pw_name
    if size is None:
        size = info.st_size
    # The times are truncated to whole seconds to fit the INT columns.
    cur.execute("INSERT INTO %s VALUES(?, ?, ?, ?, ?, ?, ?, ?)" % table,
                (fullpath, name, root, owner, size,
                 int(info.st_atime), int(info.st_mtime), int(info.st_ctime)))
    return size


def map_tree(cur, topdir):
    """Walk *topdir*, adding a row per file (Files) and directory (Dirs).

    Each Dirs row is left holding only the total size of the files
    directly inside that directory; update_directory_sizes() adds the
    subdirectory contributions afterwards. Entries that cannot be
    recorded are reported as "skipping <path>" and the walk continues.
    """
    # Initial root directory entry. A failure here is not caught: there
    # is nothing useful to map if the top directory cannot be stat'ed.
    add_entry(cur, "Dirs", topdir, '.', topdir, size=0)

    print("Mapping directory:  %s" % topdir)
    for root, dirs, files in os.walk(topdir):

        # Loop over files
        tot_size = 0
        for name in files:
            path = join(root, name)
            try:
                tot_size += add_entry(cur, "Files", path, name, root)
            except Exception:
                # Deliberately broad best-effort skip, but (unlike a bare
                # "except:") it lets Ctrl-C and SystemExit through.
                print("skipping %s" % path)

        # Add total size of files in this directory to entry in Dirs table
        cur.execute("UPDATE Dirs SET size=? WHERE fullpath=?",
                    (tot_size, root))

        # Loop over directories
        for name in dirs:
            path = join(root, name)
            try:
                add_entry(cur, "Dirs", path, name, root, size=0)
            except Exception:
                print("skipping %s" % path)


def update_directory_sizes(cur):
    """Fold the size of every subdirectory into its parent's Dirs row.

    On entry each Dirs row holds only the size of the files directly in
    that directory. Rows are visited longest fullpath first: a child's
    fullpath is always longer than its parent's, so every child already
    holds its complete subtree total by the time its parent is summed.
    """
    cur.execute("SELECT fullpath FROM Dirs ORDER BY length(fullpath) DESC")
    for row in cur.fetchall():
        fullpath = row[0]
        try:
            # Matches the directory itself plus its direct children.
            cur.execute("SELECT SUM(size), COUNT(*) FROM Dirs "
                        "WHERE root=? OR fullpath=?", (fullpath, fullpath))
            totals = cur.fetchone()
            # A count of 1 means no subdirectories: size is already right.
            if totals[1] > 1:
                cur.execute("UPDATE Dirs SET size=? WHERE fullpath=?",
                            (totals[0], fullpath))
        except lite.Error as exc:
            print("Unexpected error: %r" % (exc,))


def fill_user_summary(cur):
    """Fill the Users table with the total file size owned by each user.

    Rows are inserted largest total first.
    """
    cur.execute("SELECT owner,SUM(size) AS tot FROM Files "
                "GROUP BY owner ORDER BY tot DESC")
    summary = [tuple(row) for row in cur.fetchall()]
    cur.executemany("INSERT INTO Users VALUES(?, ?)", summary)


def main(argv):
    """Map the directory named by argv[1] into DB_FILENAME.

    Returns the process exit status: 0 on success, 1 when the top-level
    directory argument is missing.
    """
    if len(argv) != 2:
        print("You must supply the top-level directory!")
        return 1

    topdir = os.path.abspath(argv[1])

    # Connect to SQLite DB file, deleting any existing one first
    if os.path.exists(DB_FILENAME):
        os.remove(DB_FILENAME)
    con = lite.connect(DB_FILENAME)

    try:
        # "with con" commits on success and rolls back on an exception;
        # it does not close the connection, hence the finally below.
        with con:

            # Specify that next cursor should be "dictionary"
            # (i.e. python's hash map) so columns can be indexed
            # by name
            con.row_factory = lite.Row

            # Tell SQLite to use 8-bit strings instead of unicode strings
            # to avoid error messages complaining about such stuff
            con.text_factory = str

            cur = con.cursor()
            create_tables(cur)

            # Record start time of scan and other misc info
            record_info(cur, 'user account', getpass.getuser())
            record_info(cur, 'scan host', socket.gethostname())
            record_info(cur, 'topdir', topdir)
            record_info(cur, 'scan start time', timestamp())

            # Walk directory tree, filling DB
            map_tree(cur, topdir)
            print("Finished mapping files.")
            print(" ")

            # At this point, the Dirs table has size columns that hold
            # only the size of the files directly in them, but not from
            # any subdirectories. Here, we scan through the Dirs table
            # and add the size of daughter directories to parent
            # directories
            print("Updating directory sizes ...")
            record_info(cur, 'directory update start time', timestamp())
            update_directory_sizes(cur)

            # Fill table that holds condensed information by
            # user. For large numbers of files, queries to get this
            # info can take several seconds making it costly so it's
            # worthwhile to do it once now.
            print("Filling Users summary table ...")
            record_info(cur, 'user summary start time', timestamp())
            fill_user_summary(cur)

            # Record end time of scan
            record_info(cur, 'scan end time', timestamp())
    finally:
        con.close()

    print("Done.")
    print(" ")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))