""" Run the ABM with the specified batch file Usage help ---------- python abmdriver.py -h Release Notes ------------- Release 1.6 * support for update metric names * better error handling * cleanup """ # Copyright (C) 2007 by # Sasha Gutfraind # Distributed under the terms of the GNU Lesser General Public License # http://www.gnu.org/copyleft/lesser.html # import sys sys.path.append('/nfs/local64/Python-2.4.3s0.5.1/lib/python2.4/site-packages/') sys.path.append('/usr/lib64/python2.4/') sys.path.append('/use/local/Python-2.4.3s0.5.1/lib/python2.4/site-packages/') sys.path.append('rpy/') import numpy import matplotlib #for running scripts silently: #matplotlib.use('PS') import matplotlib.pylab as pylab #import pylab import scipy import re, os, sys try: import rpy except ImportError: print 'Couldn\'t import RPy. Statistical analysis functionality is deactivated.' def usage(): print "Script for running the ABM and analyzing the data" print "Allowed options are:" print "[-b ] Batch file to execute. Default: analysis1.pf" print "[-d ] Perform analysis on Dir in which subdirectories contain" print " simulation output files. " print " Default parsed from batch file \"outputDirectory\' parameter" print " Each subdirectory is assumed to contain runs with" print " the same control parameters, but different seeds." print "[-a ] y=perform batch and then analyze (default), " print " n=just run and not analyze, o=analyze Only." print "-h Displays this" print "eg." print r"To run and analyze a batch file" print r"python abmdriver.py -b analysis1.pf" print print r"To analyze without running the batch file(data generated in the past)" print r"python abmdriver.py -b analysis1.pf -a o" def initialize(): import getopt opts, args = getopt.getopt(sys.argv[1:], "b:d:a:h", [""]) batchFile = '' dataDir = '' analysisMode = 'runANDanalyze' for o, a in opts: if o in ("-h", "--help"): usage() sys.exit(0) if o in ("-b"): batchFile = a if o in ("-d"): dataDir = a analysisMode = 'analyzeOnly' if o in ("-a"): if (a == 'y') or (a == 'Y'): analysisMode = 'runANDanalyze' elif (a == 'n') or (a == 'N'): analysisMode = 'runANDstop' elif (a == 'o') or (a == 'O'): analysisMode = 'analyzeOnly' else: raise ValueError, 'Couldn\'t parse analysis mode' if not os.path.exists(batchFile) and not dataDir: raise ValueError, 'Batch file \"' + batchFile + '\" doesn\'t exist! Did you specify a batch file correctly (name after -b option, path has no \\)?' if dataDir == '' and analysisMode != 'runANDstop': try: bFile = os.fdopen(os.open(batchFile, 0)) except: raise IOError, 'Couldn\'t parse batch file for output directory. Make sure it is explicitly given in the batch file.' searchStarted = False for idx, line in enumerate(bFile): if 'outputDirectory' in line: searchStarted = True if searchStarted: m=re.search('.*set_string:\s*', line) if not isinstance(m, type(None)): dataDir = line[m.end():len(line)-1] dataDir = dataDir.strip() break if dataDir == '': raise IOError, 'Couldn\'t parse batch file for output directory. Make sure it is on next line after parameter name' if not os.path.isabs(dataDir): dataDir = os.getcwd()+os.sep+dataDir if os.path.exists(dataDir) and analysisMode != 'analyzeOnly': print '!!!!!!!!!!!!!!!Warning!!!!!!!!!!!!!!' print '' print ' outputDirectory already exists' print '' print '1) If you are repeating a run, make sure that the seeds are allowed to vary' print '2) If it\'s a new simulation, you probably want to change the outputDirectory' print '' userIn = raw_input('Do you want to continue [Y]/N?: ') if userIn == 'N' or userIn == 'n' or userIn == 'No' or userIn == 'NO': sys.exit(0) if not os.path.exists(dataDir) and analysisMode == 'analyzeOnly': print 'outputDirectory "%s" doesn\'t exist' %dataDir sys.exit(1) return (batchFile, dataDir, analysisMode) def parseDataFile(path): radicalData = {} try: f = os.fdopen(os.open(path, 0)) except: raise IOError, 'Can\'t read run report: ' + path headerIdx = 4 for idx, line in enumerate(f): if idx < headerIdx: continue elif idx == headerIdx: header = re.split(",", line) header.pop() break rawData = [] numSeries = len(header) for i in range(numSeries): rawData.append([]) for idx, line in enumerate(f): ar = re.split(",", line) for i in xrange(numSeries): rawData[i].append(float(ar[i])) for i in range(numSeries): radicalData[header[i]] = rawData[i] return radicalData def summarizeRadicalization(paramValStats): #paramValMeans = dict.fromkeys(paramValStats[0][0].keys(), 0) headers = paramValStats[0][0].keys(); tmp = [] for i in range(len(headers)): tmp.append([]) paramValMeans = dict(zip(headers, tmp)) for means, vars in paramValStats: #vars is not analyzed for k in paramValMeans: #paramValMeans[k] += means[k] paramValMeans[k].append(means[k]) #for k in paramValMeans: #paramValMeans[k] /= len(paramValStats) return paramValMeans def analyzeRunData(radicalData): means, vars = {}, {} for k in radicalData: #means[k] = radicalData[k][-1] means[k] = numpy.average(radicalData[k]) #vars[k] = numpy.std(radicalData[k]) return means, vars def reportStats(dataDir): dirList = os.listdir(dataDir) dirs = [] report = {} for path in dirList: path = dataDir+os.sep+path if os.path.isdir(path): dirs.append(path) numFiles = -1; garbageWarning = False for dir in dirs: paramValFiles = os.listdir(dir) if numFiles == -1: numFiles = len(paramValFiles) elif len(paramValFiles) != numFiles: garbageWarning = True paramValStats = [] for f in paramValFiles: if not "raw" in f: continue radicalData = parseDataFile(dir+os.sep+f) means, vars = analyzeRunData(radicalData) paramValStats.append([means, vars]) report[os.path.split(dir)[1]] = summarizeRadicalization(paramValStats) if garbageWarning: print 'Warning: At least one of the data directories includes an extra run. It could be garbage..' return report def sortPropernumerically(str1, str2): ar1 = re.split('=', str1) ar2 = re.split('=', str2) for i in range(len(ar1)): s1 = ar1[i] s2 = ar2[i] if not (s1[0].isdigit() or s1[0] in ('-', '+')) or not (s2[0].isdigit() or s2[0] in ('-', '+')): if s1 < s2: return -1 elif s1 > s2: return 1 else: continue else: try: f1 = float(s1) f2 = float(s2) except ValueError: if s1 < s2: return -1 elif s1 > s2: return 1 else: continue if f1 < f2: return -1 elif f1 > f2: return 1 else: continue return 0 def runRegression(statName, bestValues, seriesNames, f): f.write('\nRegression for ' + statName + '\n') xvals = numpy.array([], dtype=numpy.double) yvals = numpy.array([], dtype=numpy.double) f.write(statName + '\t' + 'Mean\n') for i, name in enumerate(seriesNames): Ys = numpy.array(bestValues[i], dtype=numpy.double) try: Xs = float(seriesNames[i]) * numpy.ones(Ys.size, dtype=numpy.double) except: Xs = float(i) * numpy.ones(Ys.size, dtype=numpy.double) xvals = numpy.concatenate((xvals, Xs)) yvals = numpy.concatenate((yvals, Ys)) f.write(seriesNames[i] + '\t' + str(Ys.mean()) + '\n') data={'x':xvals, 'y':yvals} lm_res = rpy.r.lm(rpy.r('y~x'), data) lm_sum = rpy.r.summary_lm(lm_res) lm_aov = rpy.r.summary_aov(lm_res) coeffs = lm_res['coefficients'].values() CI_m = rpy.r.confint(lm_res['call'], 'x', level=0.95)[0] CI_int = rpy.r.confint(lm_res['call'], '(Intercept)', level=0.95)[0] f.write('slope (95%% CI)\n%.4G'%(coeffs[0]) + '\n') f.write('[%.4G,%.4G]'%(CI_m[0],CI_m[1]) + '\n') f.write('intercept (95%% CI)\n%.4G'%(coeffs[1]) + '\n') f.write('[%.4G,%.4G]'%(CI_int[0],CI_int[1]) + '\n') f.write('R-sq\n%.4G'%(lm_sum['r.squared']) + '\n') f.write('R-sq-adj\n%.4G'%(lm_sum['adj.r.squared']) + '\n') f.write('f-stat\n%.4G'%(lm_aov['F value'][0]) + '\n') f.write('Pr(>F)\n%.4G'%(lm_aov['Pr(>F)'][0]) + '\n') def displayStats(dataDir, report): import time series = report.keys() series.sort(sortPropernumerically) try: if rpy.r: do_tests = True except: print 'Couldn\'t import RPy... No statistical analysis will be done.' do_tests = False #for k in report: # print k + ': ' + str(report[k]) valuableStats = ['fracRadicals', 'radicalNeighbZeal', 'radicalClustering', 'Fraction Radicals', 'Radical Dyads', 'Radical Triads'] if do_tests: f = open(dataDir + os.sep + 'stats_' + str(time.localtime()[3:-1]) + '.txt', 'w') s = series[0] paramName = s[:s.find('=')] f.write('Effect of parameter: %s\n'%paramName) for statName in valuableStats: dataOverParamValues = [] try: for k in series: #TODO: add a test here to check all dirs have the same number of files. dataOverParamValues.append(report[k][statName]) except: print 'Unable to read metric: ' + statName + '. It might be obsolete.' continue pylab.figure() try: pylab.boxplot(numpy.array(dataOverParamValues).transpose()) except: print 'Couldn\'t produce box plot. Make sure all parameter test groups have the same size.' raise seriesAbr = [] for i in range(len(series)): s = series[i] seriesAbr.append(s[s.find('=')+1:]) pylab.xlabel(paramName) pylab.xticks(range(1,len(report)+1), seriesAbr) pylab.ylabel(statName) pylab.savefig(dataDir + os.sep + statName + str(time.localtime()[3:-1]) + '.eps') try: if do_tests: runRegression(statName, dataOverParamValues, seriesAbr, f) except: print 'Could not perform regression for ' + statName if do_tests: f.close() #pylab.show() print 'Done' if __name__ == "__main__": batchFile, dataDir, analysisMode = initialize() if analysisMode == 'runANDanalyze' or analysisMode == 'runANDstop': #see java -X | more for documentation runCmd = 'java -jar radicalization.jar -b ' + batchFile #runCmd = 'java -Xms500m -Xmx2G -jar radicalization.jar -b ' + batchFile print 'Running: ' + runCmd if os.system(runCmd): raise Exception, 'Batch execution failed.' if analysisMode == 'runANDanalyze' or analysisMode == 'analyzeOnly': print 'Starting data analysis on directory:' print dataDir report = reportStats(dataDir) displayStats(dataDir, report) #TODO: copy the batch file to the dataDir for reference