#!/usr/bin/env python
#
# OVERVIEW
# ------------------
# The job of this script is to change the symbolic links
# in the /gluex/rawdata directory to point to the appropriate
# partitions on gluonraid3.jlab.org. If the links are updated
# then the hd_stage_to_tape.py script is launched in the
# background to start immediately copying any files in the
# "prev" directory to tape.
#
# NOTE(review): the directory, host and lock file names quoted in
# this header differ from the constants defined below
# (PARTITION_LIST, SYMLINK_*, LOCKFILENAME). The constants are what
# the script actually uses. The launch of hd_stage_to_tape.py is
# also currently commented out near the bottom of the script.
#
# This script may be run from :
#
#   1. CODA DAQ system run_prestart_sync script
#   2. command line
#
# One may pass a single argument to this script that is one
# of "DAQ", or "COMMAND_LINE" corresponding to the above.
# If no argument is passed then "COMMAND_LINE" is assumed.
# This is purely informational so it can record in the log
# file how the program was run.
#
#
# DESCRIPTION
# ------------------
# The links are rotated
# every time a new run is started so that the previous run
# may be copied to tape from one set of physical disks
# without interfering with the current run which is writing
# to a different set of physical disks. Each partition uses
# a dedicated set of physical disks in the gluonraid3 server.
# We switch partitions for each run so that the copy to tape
# may proceed as soon as each run is complete. Otherwise, we
# would have to wait for the partition to fill causing a delay
# in getting the data to tape and making it available for
# processing on the Scientific Computing farm.
#
# An instance of the hd_stage_to_tape.py script is launched
# as soon as the links are updated. Some trouble is taken
# to run the program as "hdsys" on gluonraid3 and in a detached
# process. The actual command is something like:
#
#  > nohup ssh gluonraid3 run_as_user hd_stage_to_tape.py
#
#
# Since a long break between runs may occur, the
# hd_stage_to_tape.py script is also run via cronjob to make
# sure the most recent runs are copied to tape in a timely manner.
# When run from a cronjob, all partitions are examined and an
# algorithm decides if any should be processed. See comments
# in the hd_stage_to_tape.py script for details.
#
# One may also need to rotate the links "by hand" so it's
# possible this may be run from the command line as well.
#
# Having multiple, asynchronous initiators of this script
# means some coordination is needed. We do this via a lock
# file:
#
#   /gluonraid3/data1/hd_rotate_raid_links.pid
#
# The script will open the file and then try to obtain an
# exclusive lock on it. If another process has it locked,
# then the links are likely in the process of being rotated
# already. Thus, this script will simply exit without changing
# anything. Testing has shown that if this script is killed
# while it holds the lock, then the lock is automatically freed.
#
#
#

import os, sys, fcntl, time, errno, calendar, exceptions
import subprocess

# List of partitions to include in rotation
PARTITION_LIST = []
#PARTITION_LIST.append('/gluonraid3/data1')
#PARTITION_LIST.append('/gluonraid3/data2')
#PARTITION_LIST.append('/gluonraid3/data3')
#PARTITION_LIST.append('/gluonraid3/data4')
PARTITION_LIST.append('/gluonraid4/data1')
PARTITION_LIST.append('/gluonraid4/data2')
PARTITION_LIST.append('/gluonraid4/data3')
PARTITION_LIST.append('/gluonraid4/data4')

LOCKFILENAME = '/tmp/hd_rotate_raid_links.pid'
SYMLINK_CURR = '/gluex/data/rawdata/curr'
SYMLINK_PREV = '/gluex/data/rawdata/prev'
SYMLINK_NEXT = '/gluex/data/rawdata/next'

# A second copy of each link is maintained under /gluondaqfs/.
# Plain concatenation is kept on purpose so the resulting strings
# (including the doubled slash) are exactly what earlier versions
# of this script used and printed.
DAQFS_PREV = '/gluondaqfs/' + SYMLINK_PREV
DAQFS_CURR = '/gluondaqfs/' + SYMLINK_CURR
DAQFS_NEXT = '/gluondaqfs/' + SYMLINK_NEXT

# The last non-option argument names who started us (log info only)
INITIATOR = 'COMMAND_LINE'
for arg in sys.argv[1:]:
    if not arg.startswith('-'):
        INITIATOR = arg

print('--- hd_rotate_raid_links.py starting- pid: %d ---' % os.getpid())
print('Initiated by: %s on %s' % (INITIATOR, time.ctime()))

# Refuse to run with nothing to rotate through. Without this check
# the old links would be removed and the script would then fail
# with an IndexError while reporting the new ones.
if not PARTITION_LIST:
    print('PARTITION_LIST is empty: leaving existing symlinks untouched')
    sys.exit(-1)

# Open the lock file, creating it if necessary, and take an
# exclusive, non-blocking lock on it. Only these two calls are
# guarded so that unrelated errors later in the script are not
# misreported as locking problems. os.open() raises OSError while
# fcntl.flock() raises IOError (Python 2), so both are caught.
try:
    fd = os.open(LOCKFILENAME, os.O_CREAT | os.O_RDWR)
    fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
except (IOError, OSError) as e:
    print('Unable to obtain lock on file: %s (%d)' % (LOCKFILENAME, e.errno))
    print('"%s"' % str(e))
    if e.errno == errno.EAGAIN:
        sys.exit(0)   # another process holds lock
    else:
        sys.exit(-1)  # something else went wrong

# Lock file is now opened and locked
print('Opened and locked file: %s' % LOCKFILENAME)

# Drop whatever a previous run left in the file first; otherwise a
# shorter pid written over a longer one leaves stale digits behind.
os.ftruncate(fd, 0)
os.write(fd, '%d\n' % os.getpid())

MAXPARTITION = len(PARTITION_LIST)

# Find which partition "curr" is pointing to.
#   - link unreadable            -> icurr stays 0
#   - link points outside list   -> icurr = MAXPARTITION
#   - otherwise                  -> index of the partition in the list
icurr = 0
try:
    scurr = os.readlink(SYMLINK_CURR)
except OSError as e:
    print(' error checking symlink %s' % SYMLINK_CURR)
    print(' using defaults')
else:
    if scurr in PARTITION_LIST:
        icurr = PARTITION_LIST.index(scurr)
    else:
        icurr = MAXPARTITION
    print(' current symlink: %s' % scurr)

# Advance "curr" by one partition, wrapping to the first partition
# when we run off the end (this also covers the "not in list" case).
icurr += 1
if icurr >= MAXPARTITION:
    icurr = 0
# icurr is now a valid index, so neighbours are a simple wrap-around
inext = (icurr + 1) % MAXPARTITION
iprev = (icurr - 1) % MAXPARTITION

# (link name, partition it should point to), in the order the links
# are removed, created and reported.
NEW_LINKS = [
    (SYMLINK_PREV, PARTITION_LIST[iprev]),
    (SYMLINK_CURR, PARTITION_LIST[icurr]),
    (SYMLINK_NEXT, PARTITION_LIST[inext]),
    (DAQFS_PREV, PARTITION_LIST[iprev]),
    (DAQFS_CURR, PARTITION_LIST[icurr]),
    (DAQFS_NEXT, PARTITION_LIST[inext]),
]

# Remove old symlinks. This stays best-effort: a missing link is
# expected and ignored, any other failure is reported but does not
# stop the rotation.
for link, target in NEW_LINKS:
    try:
        os.unlink(link)
    except OSError as e:
        if e.errno != errno.ENOENT:
            print('Unable to remove old symlink: %s\n%s' % (link, str(e)))

# Create new symlinks. Failures are reported (with the reason) and
# the remaining links are still attempted.
for link, target in NEW_LINKS:
    try:
        os.symlink(target, link)
    except OSError as e:
        print('Unable to create symlink: %s\n%s' % (link, str(e)))

# Report the intended layout (printed even if a link above failed)
print('')
print(' New symlinks are:')
for link, target in NEW_LINKS:
    print(' %s -> %s' % (link, target))
print('')

# Unlock and close lock file
print('Releasing lock and closing %s' % LOCKFILENAME)
fcntl.flock(fd, fcntl.LOCK_UN)
os.close(fd)

# Launch hd_stage_to_tape.py process to start copying files to tape
# (n.b. the hd_stage_to_tape.py script is also run via cron job from
# the hdsys account on gluonraid3)
#
# The launch itself is currently disabled; the environment and
# command are still built so that re-enabling it only requires
# uncommenting the two lines below.
modified_env = os.environ.copy()
modified_env['HOME'] = '/home/hdsys'
cmd = ['nohup', 'ssh', 'gluonraid4', 'run_as_user', 'hd_stage_to_tape.py']
#print ' Running hd_stage_to_tape.py: ' + str(cmd)
#subprocess.Popen(cmd, env=modified_env)
print('Disabled running of hd_stage_to_tape.py for now ...')

print('finished %s' % time.ctime())
print('----------------------------------------------------')