#!/usr/bin/env python
#
# This script is run by a cron job on both gluonraid1 and gluonraid2 using
# the hdsys account every 5 minutes. Its job is to scan the "active" directory
# on the RAID disk for files that should be copied to the tape library. Any files
# found in the /raid/rawdata/active/$RUN_PERIOD directory that have not been
# modified for at least MIN_TIME_SINCE_MODIFICATION minutes (set via variable
# below) will be moved to the "volatile" directory (preserving the directory
# structure relative to "active"). At the same time, a hard link will be made
# in the corresponding "staging" directory pointing back to the file in the
# "volatile" directory.
#
# The exception to this is files within a subdirectory
# whose path contains "/RunLog". Files in these directories are small configuration
# files and copying them individually to tape is inefficient and we have been
# asked not to do that. Instead, the "/RunLog*" directories are scanned for
# their newest files. If the newest file in the whole directory tree is at
# least MIN_TIME_SINCE_MODIFICATION_RUNLOG minutes old, then a tarball of the
# RunLog directory is made in "volatile" and a hard link to that is made in
# "staging" so that the tarball is copied to tape.
#
# A cronjob run from the root account on gluonraid1(2) will check for files in
# "staging" directory and will copy them to the tape library if they have
# not been modified for at least 5 minutes. Once a file is copied to the
# tape library, it is unlinked in "staging". This leaves the copy that is
# in "volatile" so it can be used for offline analysis on the counting house
# computers. A separate cron job will periodically remove files from the
# "volatile" directory in order to make space on the RAID disk.
#
# Important Values:
#
# RUN_PERIOD   is taken from the environment variable which is set in the
#              /gluex/etc/hdonline.cshrc file
#
# HDLOG        is taken from the environment variable which is set in the
#              /gluex/etc/hdonline.cshrc file.
#              This points to the directory
#              where the log file for this script is kept. The log file is
#              probably named stage_to_tape.log and is probably:
#                /gluex/log/stage_to_tape.log
#
#
# This script can be run from any directory
#

import os
import sys
import getpass
import socket
import subprocess
import time
from os.path import join, getsize, getatime, getmtime, getctime
from os import stat
from pwd import getpwuid
from time import gmtime, localtime, strftime

VERBOSE = 0
TEST_MODE = False
MIN_TIME_SINCE_MODIFICATION = 5.0           # time in minutes
MIN_TIME_SINCE_MODIFICATION_RUNLOG = 120.0  # time in minutes
NOW = time.time()                           # single reference time for the whole run
HDLOG = os.getenv('HDLOG', '.')             # log directory; falls back to the cwd

# NOTE(review): the header comment refers to "stage_to_tape.log" but the name
# used here is "stage_totape.log". Left unchanged because renaming would
# redirect the log output -- confirm which name is intended.
LOGFILE = '%s/stage_totape.log' % HDLOG

if '-v' in sys.argv:
    VERBOSE = 1  # allow command line to turn on verbose messaging (for debugging)


#----------------------------
# printlog
def printlog(mess):
    """Record the message *mess*.

    When TEST_MODE is true the message is printed to stdout unchanged.
    Otherwise it is prefixed with the host name and the local time and
    appended as one line to LOGFILE.
    """
    if TEST_MODE:
        # Parenthesised single argument: valid as a Python 2 print statement
        # and as a Python 3 print() call.
        print(mess)
    else:
        tmess = '%s : %s :: %s' % (
            socket.gethostname(),
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            mess)
        with open(LOGFILE, "a") as logfile:
            logfile.write(tmess + '\n')


#----------------------------
# Print start time to log file
printlog('----- START %s -----' % time.ctime(NOW))

RUN_PERIOD = os.getenv('RUN_PERIOD')
if RUN_PERIOD is None:
    # Every path below is derived from RUN_PERIOD, so nothing can be done
    # without it: explain in the log and stop.
    printlog('The "RUN_PERIOD" environment variable is not defined!"')
    printlog('Please set it to something like the following and re-run:')
    printlog(' ')
    printlog('   RunPeriod-2014-10')
    sys.exit(-1)

RAID_DIR = '/raid/rawdata'  # always use local RAID disk
ACTIVE = RAID_DIR + '/active/' + RUN_PERIOD
STAGING = RAID_DIR + '/staging/' + RUN_PERIOD
VOLATILE = RAID_DIR + '/volatile/' + RUN_PERIOD

if VERBOSE > 0:
    print(' ')
    print('RUN_PERIOD : ' + RUN_PERIOD)
    print('    ACTIVE : ' + ACTIVE)
    print('   STAGING : ' + STAGING)
    print('  VOLATILE : ' + VOLATILE)
    print(' ')


#----------------------------
# mkdirpath
def mkdirpath(path):
    """Create directory *path* together with any missing parent directories.

    Best effort: nothing is raised to the caller. If the directory cannot be
    created the error text is written to the log with printlog(). An empty
    path is ignored. A relative path is created relative to the current
    working directory (the previous hand-rolled loop always prefixed '/',
    which silently turned a relative path into one under the filesystem root).
    """
    if path == '':
        return  # nothing to create
    try:
        if not os.path.isdir(path):
            os.makedirs(path)
    except OSError as e:
        # Another process may have created the directory between the check
        # and the makedirs call; only report when it is really missing.
        if not os.path.isdir(path):
            printlog(str(e))
    except Exception as e:
        # e.g. a non-string path: keep the original log-and-continue behaviour
        printlog(str(e))

#---------------------------- # Walk directory tree, looking for files printlog("Searching directory: " + ACTIVE) dirs_to_make = [] files_to_move = [] for root, dirs, files in os.walk(ACTIVE): # Loop over files for file in files: try: fp = join(root, file) fullpath = '%s' % fp relpath = fullpath[len(ACTIVE)+1:] entry = getpwuid(stat(fp).st_uid) owner = entry.pw_name mtime = getmtime(fp) mdiff = (NOW - mtime)/60.0 # time since modification in minutes # Files must not have been modified recently if mdiff0: print 'tarball already exists. skipping: ' + fullpath_tarball except Exception,e: printlog(str(e)) printlog("Unexpected error:" + str(sys.exc_info()[0])) printlog("skipping" + join(root, file)) # Print end time to log file printlog('----- END %s -----' % time.ctime(NOW))