#!/usr/bin/env python
#
# This currently reads from a file created by running the following
# commands on Cori:
#
#   > cd ~/builds/accounting
#   > sacct --format=JobID%15,Submit,Start,End,NCPUS,CPUTimeRaw,ResvCPURAW,MaxRSS,JobName%30,ExitCode,MaxDiskRead -S "2018-08-21 11:00" > slurm.dat
#
# Then running this to copy it to the local machine:
#
#   local.jlab> scp cori.nersc.gov:builds/accounting/slurm.dat .
#
#
# If an argument is given, it is taken as the date/time that all of the
# times written to the CSV file are relative to. The format should be
# something like: 2018-09-11T10:37:23
# If no argument is given, the earliest submit time is used.
#
# If a second argument is given, it is taken as the maximum submit time to
# be written to the CSV file. The ROOT macros will use the latest time to
# determine the right-hand edge of the time axis. Note that times earlier
# than the start time (first argument) will end up with negative values in
# the CSV file and therefore be outside of the plot range. Times later than
# the end time will not show up in the CSV file at all.
#
# n.b. the sqlite file will contain the actual date/time. Only the CSV file
# uses relative times (in seconds) to make it easier to handle in ROOT. It
# is therefore necessary to tell the ROOT macros the start time used so the
# time axis is displayed correctly.
#
#
# Field definitions:
#
# JobID%15   - JobID. The "%15" part makes it print up to 15 characters of
#              the id. Each job actually has 3 steps and each shows up as a
#              line with the same numerical jobid, but with the second and
#              third having ".batch" and ".extern" appended.
#
# Submit     - Submit time in YYYY-MM-DDTHH:MM:SS format. Note that the "T"
#              is always there and separates the date and time sections.
#
# Start      - Start time of the job in the same format as Submit.
#
# End        - End time of the job in the same format as Submit.
#
# NCPUS      - Number of CPUs assigned to the job. This looks to always be
#              64. This counts both full cores and hyperthreads since the
#              nodes have only 32 full cores.
#
# CPUTimeRaw - Amount of time taken by the job in CPU-seconds. Divide this
#              by NCPUS to get the amount of time a single core was in use.
#
# ResvCPURAW - The amount of CPU-seconds the job was waiting to run. This is
#              a funny unit to use here since no actual CPU usage is going
#              on. Divide this by NCPUS to get the actual seconds the job
#              was waiting between Submit and Start.
#
# MaxRSS     - Maximum Resident Set Size, i.e. memory used. This is printed
#              with a "K" suffix which I assume is kB.
#
# JobName%30 - Job name. This should be something like
#              "GLUEX_offmon_041008_005". It is the only way I can see to
#              get at the run/file numbers of the job. This may actually be
#              a potential issue since some jobs may get retried after
#              failure and they will have the same name. The "%30" is just
#              to make the number of printed characters large enough to
#              hold the whole name. Note also that the "batch" and "extern"
#              steps will have names that are just that.
#
# ExitCode   - This holds both the exit code and the signal type that caused
#              the exit, like this: exit_code:signal_type
#
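# For illustration only, the three lines for a single job in slurm.dat look
# roughly like this (spacing compressed, every value made up, laid out the
# way the parsing code below assumes):
#
#             JobID              Submit               Start                 End NCPUS CPUTimeRaw ResvCPURAW    MaxRSS                 JobName ExitCode
#          14607264 2018-08-21T11:02:10 2018-08-21T11:30:00 2018-08-21T12:15:42    64     175488     106880           GLUEX_offmon_041008_005      0:0
#    14607264.batch 2018-08-21T11:30:00 2018-08-21T11:30:00 2018-08-21T12:15:42    64     175488            31457280K                   batch      0:0
#   14607264.extern 2018-08-21T11:30:00 2018-08-21T11:30:00 2018-08-21T12:15:42    64     175488                 512K                  extern      0:0
#
# i.e. the parent line carries ResvCPURAW but no MaxRSS, the ".batch" line
# carries MaxRSS but no ResvCPURAW, and the ".extern" line carries the
# ExitCode that is used below.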
import sqlite3
import csv
import sys

start_time = None
end_time = None
if len(sys.argv) > 1: start_time = sys.argv[1]
if len(sys.argv) > 2: end_time = sys.argv[2]

with open('slurm.dat') as f:

    conn = sqlite3.connect('slurm.db')
    c = conn.cursor()
    try:
        c.execute('''DROP TABLE IF EXISTS jobs''')  # force recreation of table to avoid duplicate entries
        c.execute('''CREATE TABLE jobs
                     (ID integer, Submit text, Start text, End text,
                      NCPUS integer, CPUTIME integer, MaxRSS_GB float,
                      Latency real, JobName text, job_type text,
                      Run integer, file integer, ExitCode integer)''')
    except sqlite3.Error:
        print('Error creating table')

    for line in f:

        # Split line into tokens and extract numerical id
        vals = line.split()
        id = vals[0].split('.')[0]

        # Skip header lines and unfinished jobs
        #if vals[3] == 'Unknown': continue
        if vals[0] == 'JobID' or vals[0].startswith('---'): continue

        # Each job is spread over 3 lines with much of the info redundant
        # among them. Extract certain values from certain lines and assume
        # the "extern" line is the last.
        if vals[0].endswith('.batch'):
            # The ".batch" line skips the ResvCPURAW field so MaxRSS is the
            # 7th token (index 6). Convert the K/M suffix to GB.
            MaxRSS_GB = 0.0
            if 'K' in vals[6]: MaxRSS_GB = float(vals[6].split('K')[0]) / 1000000.0
            if 'M' in vals[6]: MaxRSS_GB = float(vals[6].split('M')[0]) / 1000.0
            c.execute('UPDATE jobs SET MaxRSS_GB=? WHERE ID=?', (MaxRSS_GB, id))
        elif vals[0].endswith('.extern'):
            # The "extern" line skips the ResvCPURAW field, so find the
            # "extern" name token and take ExitCode as the token after it.
            for i in range(0, len(vals)):
                if vals[i] == 'extern':
                    if len(vals) > (i + 1):
                        ExitCode = int(vals[i + 1].split(':')[0])
                        c.execute('UPDATE jobs SET ExitCode=? WHERE ID=?', (ExitCode, id))
                    break
        elif vals[0].endswith('.0'):
            pass  # early test jobs seem to have this with NCPUS=1
        else:
            # This should be the first of the 3 lines (or the only one if
            # just one exists). The parent line skips the MaxRSS field so
            # JobName is the 8th token (index 7).
            Submit = vals[1]
            Start = vals[2]
            End = vals[3] if vals[3] != 'Unknown' else '2018-01-01'
            NCPUS = int(vals[4])
            CPUTIME = vals[5]                        # cpu-seconds
            Latency = float(vals[6]) / float(NCPUS)  # wait time between submit and start in seconds
            JobName = vals[7]

            # Parse run and file numbers out of names like
            # "GLUEX_offmon_041008_005" ("_part"/"_aggre" variants carry an
            # extra trailing token).
            nvals = JobName.split('_')
            if len(nvals) >= 4:
                if '_part' in JobName or '_aggre' in JobName:
                    job_type = nvals[1]
                    run = int(nvals[-3])
                    file = int(nvals[-2])
                else:
                    job_type = nvals[1]
                    run = int(nvals[-2])
                    file = int(nvals[-1])
            else:
                job_type = 'test'
                run = 12345
                file = 0

            ExitCode = 0
            MaxRSS_GB = 0
            #CPUTIME = 0
            if Start == 'Unknown': Start = '1970-01-04'
            if len(vals) > 8: ExitCode = int(vals[8].split(':')[0])

            myvals = (id, Submit, Start, End, NCPUS, CPUTIME, MaxRSS_GB,
                      Latency, JobName, job_type, run, file, ExitCode)
            c.execute('INSERT INTO jobs VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)', myvals)

# Flush data to the sqlite file and close it
conn.commit()
conn.close()
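# Sanity check (optional): at this point the table can be inspected directly
# with the sqlite3 command-line shell, independent of this script, e.g.:
#
#   sqlite3 slurm.db 'SELECT COUNT(*), MIN(Submit), MAX(Submit) FROM jobs'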
# Open the SQLite file back up and read data from it
conn = sqlite3.connect('slurm.db')
c = conn.cursor()

# Use either the earliest submit time or the user-specified start time
sql = 'SELECT MIN(strftime("%s",Submit)) FROM jobs'
if start_time is not None: sql = 'SELECT strftime("%s","' + start_time + '")'
c.execute(sql)
tmin = str(c.fetchone()[0])

# Use either the current time or the user-specified end time
# (passing "now" explicitly works on all SQLite versions)
sql = 'SELECT strftime("%s","now")'
if end_time is not None: sql = 'SELECT strftime("%s","' + end_time + '")'
print(sql)
c.execute(sql)
tmax = str(c.fetchone()[0])
print('tmax=' + tmax)

# Build the query that converts the absolute times into seconds relative to tmin
sql  = 'SELECT strftime("%s",Submit)-' + tmin
sql += ',strftime("%s",Start)-' + tmin
sql += ',strftime("%s",End)-' + tmin
sql += ',CPUTIME'
#sql += ',LATENCY'
sql += ',CAST(strftime("%s",Start) as integer) - CAST(strftime("%s",Submit) as integer)'
sql += ',NCPUS'
sql += ',Run'
sql += ',file'
sql += ' FROM jobs'
sql += ' WHERE CAST(strftime("%s",Submit) as integer)<' + tmax
print(sql)

# Write every selected row to the CSV file
ofile = open('slurm.csv', 'w')
csvwriter = csv.writer(ofile)
for row in c.execute(sql):
    csvwriter.writerow(row)
ofile.close()
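# Example usage (the script name and the times are placeholders; slurm.dat
# must be in the current directory):
#
#   python sacct2csv.py                                          # start = earliest Submit, end = now
#   python sacct2csv.py 2018-09-11T10:37:23                      # explicit start time
#   python sacct2csv.py 2018-09-11T10:37:23 2018-09-18T00:00:00  # explicit start and end times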