-
Notifications
You must be signed in to change notification settings - Fork 4
/
createAndSubmitJobsWithCrab3.py
406 lines (369 loc) · 17.6 KB
/
createAndSubmitJobsWithCrab3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
#!/usr/bin/env python
import subprocess
import os
import sys
import string
from optparse import OptionParser
import re
from datetime import datetime
import shutil
import math
from multiprocessing import Process,Queue
try:
from CRABClient.UserUtilities import config, getUsernameFromSiteDB
except ImportError:
print
print 'ERROR: Could not load CRABClient.UserUtilities. Please source the crab3 setup:'
print 'source /cvmfs/cms.cern.ch/crab3/crab.sh'
exit(-1)
# now we should be able to import all the crab stuff
from CRABAPI.RawCommand import crabCommand
from httplib import HTTPException
# Define valid global tags by dataset as noted here:
# https://twiki.cern.ch/twiki/bin/viewauth/CMS/PdmVAnalysisSummaryTable
# Keys are patterns that are compared against the secondary dataset name
# (the second component of the DAS dataset path) while the input list is
# processed; values are the global tag names passed on to the cfg-generation
# script via its --gt argument.
globalTagsByDataset = {}
# latest miniaod v2
globalTagsByDataset['RunIISummer16*'] = '102X_mcRun2_asymptotic_v7'
globalTagsByDataset['Run2016*'] = '102X_dataRun2_v11'
globalTagsByDataset['RunIIFall17*'] = '102X_mc2017_realistic_v7'
globalTagsByDataset['Run2017*'] = '102X_dataRun2_v11'
globalTagsByDataset['RunIIAutumn18*'] = '102X_upgrade2018_realistic_v19'
globalTagsByDataset['Run2018D*'] = '102X_dataRun2_Prompt_v14'
globalTagsByDataset['Run2018A*'] = '102X_dataRun2_v11'
globalTagsByDataset['Run2018B*'] = '102X_dataRun2_v11'
globalTagsByDataset['Run2018C*'] = '102X_dataRun2_v11'
# Extra files to feed into the crab sandbox if needed; when this list is
# non-empty its entries are added to config.JobType.inputFiles further down.
additionalInputFiles = []
# Example of how entries used to be added (kept for reference):
#rootTupleTestDir = os.getenv('CMSSW_BASE')+'/src/Leptoquarks/RootTupleMakerV2/test/'
# just feed both in, even though we only need one at a time
#additionalInputFiles.append(rootTupleTestDir+'Summer16_23Sep2016V4_MC.db')
#additionalInputFiles.append(rootTupleTestDir+'Summer16_23Sep2016AllV4_DATA.db')
def crabSubmit(config):
    """Submit one CRAB task and report the outcome on the module-level queue ``q``.

    This function is used as the target of a ``multiprocessing.Process``; the
    parent reads the status back from ``q`` after the child has finished.

    Args:
        config: CRAB configuration object handed to ``crabCommand('submit', ...)``.

    Side effects:
        Puts exactly one status on ``q``: 0 when the submission went through,
        -1 when it failed for any reason.
    """
    status = 0
    try:
        crabCommand('submit', config=config)
    except HTTPException as hte:
        print('-----> there was a problem. see below.')
        print(hte.headers)
        print('quit here')
        status = -1
    except Exception:
        # Any other failure used to kill the child before anything was put on
        # the queue, which left the parent blocked forever in q.get().
        # Report it as a failed submission instead, keeping the traceback visible.
        import traceback
        print('-----> there was a problem. see below.')
        traceback.print_exc()
        print('quit here')
        status = -1
    q.put(status)
def validateOptions(options):
    """Terminate the script unless the parsed command-line options are usable.

    Args:
        options: parsed option values; the attributes read are
            ``localStorageDir``, ``inputList``, ``jsonFile`` and ``prevJsonFile``.

    Exits with status -1 (after printing an explanation) when either required
    option (-d, -i) is missing, or when a previous JSON file is given without
    a new JSON file to subtract it from. Returns None otherwise.
    """
    missingRequired = options.localStorageDir is None or options.inputList is None
    if missingRequired:
        print('You are missing one or more required options: d, i')
        # the module-level option parser knows how to print the full help text
        parser.print_help()
        exit(-1)

    prevJsonWithoutNewJson = options.prevJsonFile is not None and options.jsonFile is None
    if prevJsonWithoutNewJson:
        print('It does not make sense to specify a previously used/analyzed JSON file without specifying a new JSON file, since with this option specified, the difference between the new and old JSON is taken as the lumi mask.')
        exit(-1)
def makeDirAndCheck(dir):
    """Create directory ``dir`` (including missing parents), refusing to reuse one.

    Args:
        dir: path of the directory to create.

    Exits with status -2 if the path already exists; nothing is overwritten.
    """
    if os.path.exists(dir):
        # in practice, this doesn't happen because of the seconds in the name, but always good to check
        print('ERROR: directory %s already exists. Not going to overwrite it.' % dir)
        exit(-2)
    os.makedirs(dir)
##############################################################
# RUN
##############################################################
#---Option Parser
#--- TODO: WHY PARSER DOES NOT WORK IN CMSSW ENVIRONMENT? ---#
# Help text: a headline, a pointer to the twiki and four worked examples,
# one per line.
#XXX TODO FIX/UPDATE THIS MESSAGE
usage = "\n".join([
    "Usage: %prog [options] ",
    "See https://twiki.cern.ch/twiki/bin/view/CMS/ExoticaLeptoquarkShiftMakeRootTuplesV22012 for more details ",
    "Example1 (NORMAL MODE): %prog -d `pwd`/RootNtuple -i inputList.txt",
    "Example2 (NORMAL MODE + RUN SELECTION): %prog -d `pwd`/RootNtuple -i inputList.txt -r 132440-200000 ",
    "Example3 (JSON MODE): %prog -d `pwd`/RootNtuple -i inputList.txt -j [JSON.txt or URL, https://cms-service-dqm.web.cern.ch/cms-service-dqm/CAF/certification/Collisions12/8TeV/Prompt/Cert_190456-208686_8TeV_PromptReco_Collisions12_JSON.txt]",
    "Example4 (PREV JSON MODE): %prog -d `pwd`/RootNtuple -i inputList.txt -j [JSON.txt or URL, https://cms-service-dqm.web.cern.ch/cms-service-dqm/CAF/certification/Collisions12/8TeV/Prompt/Cert_190456-208686_8TeV_PromptReco_Collisions12_JSON.txt] -p [lumiSummary.json from crab report from previous processing of same dataset]",
])

parser = OptionParser(usage=usage)

# One entry per command-line option, registered in this order:
# (option strings, keyword arguments for add_option).
# NOTE(review): some long option strings contain a space; they are kept verbatim.
_optionTable = (
    (("-d", "--localStorageDir"),
     dict(dest="localStorageDir", metavar="INDIR",
          help="the directory localStorageDir is where the local job info is kept")),
    (("-v", "--tagName"),
     dict(dest="tagName", metavar="TAGNAME", default="",
          help="tagName of RootNTupleMakerV2")),
    (("-i", "--inputList"),
     dict(dest="inputList", metavar="LIST",
          help="list of all datasets to be used (full path required)")),
    (("-e", "--eosDir"),
     dict(dest="eosDir", metavar="EOSDIR",
          help="EOS directory (start with /store...) to store files (used for Data.outLFNDirBase); otherwise EXO LJ group dir used with userName")),
    (("-j", "--json"),
     dict(dest="jsonFile", metavar="JSONFILE",
          help="JSON file with selected lumi sections")),
    (("-r", "--run range"),
     dict(dest="runRange", metavar="RUNRANGE",
          help="selected run range")),
    (("-p", "--previousJSON json"),
     dict(dest="prevJsonFile", metavar="PREVJSON",
          help="previous lumiSummary.json from crab")),
    (("-s", "--site siteName"),
     dict(dest="storageSite", metavar="STORAGESITE", default="T2_CH_CERN",
          help="storage site")),
)
for _optStrings, _optSettings in _optionTable:
    parser.add_option(*_optStrings, **_optSettings)

options, args = parser.parse_args()

# validate options
validateOptions(options)
# Timestamp used to make the production directory name unique.
# The month is written as an abbreviated name (%b); the all-numeric
# alternative would be "%Y%m%d_%H%M%S".
date = datetime.now()
dateString = date.strftime("%Y%b%d_%H%M%S")

# lqCustomNanoAOD_[<tagName>_]<timestamp>
nameParts = ['lqCustomNanoAOD']
if options.tagName:
    nameParts.append(options.tagName)
nameParts.append(dateString)
topDirName = '_'.join(nameParts)

# Local layout: <localStorageDir>/<topDirName>/{cfgfiles,output,workdir}
productionDir = '/'.join([options.localStorageDir, topDirName])
cfgFilesDir = '/'.join([productionDir, 'cfgfiles'])
outputDir = '/'.join([productionDir, 'output'])
workDir = '/'.join([productionDir, 'workdir'])
localDirs = [productionDir, cfgFilesDir, outputDir, workDir]

print('Making local directories:')
for localDir in localDirs:
    # tab-indented listing, one directory per line
    print('\t' + localDir)
    makeDirAndCheck(localDir)
print('')

# keep a copy of the input list next to the job information
localInputListFile = '/'.join([productionDir, 'inputList.txt'])
shutil.copy2(options.inputList, localInputListFile)
# check if we have a proxy: ask voms-proxy-info and inspect what it reports
proc = subprocess.Popen(
    ['voms-proxy-info', '--all'],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
out, err = proc.communicate()
#print 'output----->',output
#print 'err------>',err
proxyMissing = 'Proxy not found' in err
if proxyMissing or 'timeleft : 00:00:00' in out:
    # get a proxy
    print("you have no valid proxy; let's get one via voms-proxy-init:")
    # stderr of voms-proxy-init is left attached to the terminal; sending it
    # to os.devnull would silence the messages it currently prints, but would
    # also hide real errors.
    proc2 = subprocess.call(['voms-proxy-init', '--voms', 'cms', '--valid', '168:00'])
# setup general crab settings
# from https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRABClientLibraryAPI
#TODO: this will work for MC. Need to update to run over data.
# notes on how the output will be stored: see https://twiki.cern.ch/twiki/bin/view/CMSPublic/Crab3DataHandling
# <lfn-prefix>/<primary-dataset>/<publication-name>/<time-stamp>/<counter>[/log]/<file-name>
# LFNDirBase / / datasetTagName / stuff automatically done / from outputFile defined below
# NOTE: from here on the name 'config' refers to the configuration object,
# no longer to the factory function imported from CRABClient.UserUtilities.
config = config()
config.General.requestName = topDirName  # overridden per dataset
config.General.transferOutputs = True
config.General.transferLogs = False
# We want to put all the CRAB project directories from the tasks we submit here into one common directory.
# That's why we need to set this parameter (here or above in the configuration file, it does not matter, we will not overwrite it).
config.General.workArea = productionDir
#
config.JobType.pluginName = 'Analysis'
config.JobType.maxMemoryMB = 3000
# feed in any additional input files
if additionalInputFiles:
    config.JobType.inputFiles = []
    config.JobType.inputFiles.extend(additionalInputFiles)
config.JobType.psetName = ''  # overridden per dataset
config.Data.inputDataset = ''  # overridden per dataset
config.Data.inputDBS = 'global'
config.Data.splitting = 'Automatic'  # below this is set to Automatic for data, FileBased for MC
config.Data.totalUnits = -1  # overridden per dataset, but doesn't matter for Automatic splitting
# no publishing
config.Data.publication = False
config.Data.outputDatasetTag = 'LQ'  # overridden for data

# Look the user name up once and reuse it for every path built below.
userName = getUsernameFromSiteDB()

# Default output location.
#This is for EXO group space
if options.tagName:
    config.Data.outLFNDirBase = '/store/group/phys_exotica/leptonsPlusJets/RootNtuple/RunII/%s/' % userName + options.tagName + '/'
else:
    config.Data.outLFNDirBase = '/store/group/phys_exotica/leptonsPlusJets/RootNtuple/RunII/%s/' % userName + '/'
#This is for Higgs group space
#config.Data.outLFNDirBase = '/store/group/phys_higgs/HiggsExo/HH_bbZZ_bbllqq/%s/' % (getUsernameFromSiteDB()) + options.tagName + '/'
#This is for personal user space (beware quotas)
#config.Data.outLFNDirBase = '/store/user/%s/' % (getUsernameFromSiteDB()) + topDirName + '/'

# An explicit -e/--eosDir replaces the default location.
if options.eosDir is not None:
    # split off /eos/cms if it is there
    if options.eosDir.startswith('/eos/cms'):
        options.eosDir = options.eosDir.split('/eos/cms')[-1]
    # require /store unless it's CERNBOX
    if options.storageSite != 'T2_CH_CERNBOX' and not options.eosDir.startswith('/store'):
        print('eosDir must start with /eos/cms/store or /store and you specified: %s' % options.eosDir)
        print('quit')
        exit(-1)
    outputLFN = options.eosDir
    # endswith() also copes with an empty string, where indexing [-1] would raise
    if not outputLFN.endswith('/'):
        outputLFN += '/'
    if options.tagName:
        outputLFN += options.tagName + '/'
    if userName not in outputLFN:
        # str.rstrip returns a new string; the result has to be assigned,
        # otherwise the '/%s/' appended below produces a double slash.
        outputLFN = outputLFN.rstrip('/')
        # make the LFN shorter, and in any case, the timestamp is put in by crab
        # NOTE(review): with a tagName the tag ends up in the path twice
        # (once from outputLFN, once appended here), and without one the path
        # ends in '//' - left unchanged, confirm whether this is intended.
        if options.tagName:
            config.Data.outLFNDirBase = outputLFN + '/%s/' % userName + options.tagName + '/'
        else:
            config.Data.outLFNDirBase = outputLFN + '/%s/' % userName + '/'
    else:
        config.Data.outLFNDirBase = outputLFN
print('Using outLFNDirBase: %s' % config.Data.outLFNDirBase)
config.Site.storageSite = options.storageSite
# look at the input list
# use DAS to find the dataset names.
# Example:
# das_client.py --query="dataset=/LQToUE_M-*_BetaOne_TuneCUETP8M1_13TeV-pythia8/*/MINIAODSIM"
#
# Every non-empty, non-comment line of the input list must read
#   <dataset> <nUnits> <nUnitsPerJob>
# For each such line a CMSSW cfg is prepared and one CRAB task is submitted.
import fnmatch  # imported next to its only use, like LumiList further down


def _isUrl(location):
    """Return True if *location* is an http/https URL rather than a local file name."""
    return location.startswith(('http:', 'https:'))


with open(localInputListFile, 'r') as f:
    for line in f:
        split = line.split()
        if not split:
            continue
        if '#' in split[0]:  # skip comments
            #print 'found comment:',line
            continue
        if len(split) < 3:
            print('inputList line is not properly formatted: %s' % line)
            exit(-3)
        dataset = split[0]
        nUnits = int(split[1])  # also used for total lumis for data
        nUnitsPerJob = int(split[2])  # used for files/dataset for MC
        datasetNoSlashes = dataset[1:].replace('/', '__')
        # datasetNameNoSlashes looks like SinglePhoton__Run2015D-PromptReco-v3
        # so split to just get Run2015D-PromptReco-v3
        # and use that as the outputDatasetTag to get it into the EOS path
        primaryDatasetName = datasetNoSlashes.split('__')[0]
        secondaryDatasetName = datasetNoSlashes.split('__')[1]
        print('primaryDatasetName={}'.format(primaryDatasetName))
        print('secondaryDatasetName={}'.format(secondaryDatasetName))
        # get rid of part after last slash
        datasetName = primaryDatasetName + '__' + secondaryDatasetName
        thisWorkDir = workDir + '/' + datasetName
        isData = 'Run20' in datasetName
        if not isData:
            datasetName = datasetName.split('__')[0]
            config.Data.splitting = 'FileBased'
            config.Data.unitsPerJob = nUnitsPerJob
            # the config object is shared between iterations: go back to the
            # MC default so a tag set for a preceding data dataset cannot leak
            config.Data.outputDatasetTag = 'LQ'
        else:
            config.Data.outputDatasetTag = secondaryDatasetName
            config.Data.splitting = 'Automatic'
        # get era
        if 'Summer16' in secondaryDatasetName or 'Run2016' in secondaryDatasetName:
            year = 2016
        elif 'Fall17' in secondaryDatasetName or 'Run2017' in secondaryDatasetName:
            year = 2017
        elif 'Autumn18' in secondaryDatasetName or 'Run2018' in secondaryDatasetName:
            year = 2018
        else:
            print('ERROR: could not determine year from secondaryDatasetName "{0}" from datasetName "{1}"'.format(secondaryDatasetName, datasetName))
            exit(-4)
        #Handle the ext1 vs non ext case specially
        print('datasetName={}'.format(datasetName))
        if not isData:
            if 'ext' in dataset:
                # single character following '_ext', e.g. the '1' of '_ext1'
                extN = dataset[dataset.find('_ext') + 4]
                datasetName = datasetName + '_ext' + extN
                config.Data.outputDatasetTag = 'LQ_ext' + extN
            if 'backup' in dataset:
                datasetName = datasetName + '_backup'
                config.Data.outputDatasetTag = 'LQ_backup'
            #This is for DY 10-50 which has a v1 and v2, and an ext1
            if '-v2' in dataset:
                datasetName = datasetName + '-v2'
                config.Data.outputDatasetTag = 'LQ-v2'
            elif '-v1' in dataset:
                datasetName = datasetName + '-v1'
                config.Data.outputDatasetTag = 'LQ-v1'
        config.Data.inputDataset = dataset
        #print 'make dir:',thisWorkDir
        makeDirAndCheck(thisWorkDir)

        # Output file name: the shortest of three candidates derived from the
        # primary dataset name (cut at '_Tune', cut at '_13TeV', or uncut).
        outputFileNames = []
        outputFileNames.append(dataset[1:dataset.find('_Tune')])
        outputFileNames.append(dataset[1:dataset.find('_13TeV')])
        outputFileNames.append(dataset.split('/')[1])
        # get the one with the shortest filename
        outputFile = sorted(outputFileNames, key=len)[0]
        if isData:
            outputFile = outputFile + '_' + config.Data.outputDatasetTag
        if 'ext' in dataset:
            extN = dataset[dataset.find('_ext') + 4]
            outputFile = outputFile + '_ext' + extN
        if 'backup' in dataset:
            outputFile = outputFile + '_backup'

        # Refuse to submit if a representative output path would be too long.
        storagePath = config.Data.outLFNDirBase + primaryDatasetName + '/' + config.Data.outputDatasetTag + '/' + 'YYMMDD_hhmmss/0000/' + outputFile + '_999.root'
        print('will store (example): %s' % storagePath)
        #print '\twhich has length:',len(storagePath)
        if len(storagePath) > 255:
            print('')
            print('we might have a problem with output path lengths too long (if we want to run crab over these).')
            print('example output will look like:')
            print(storagePath)
            print('which has length: %d' % len(storagePath))
            print('cowardly refusing to submit the jobs; exiting')
            exit(-2)
        else:
            print('')
            print('will use storage path like: %s' % storagePath)

        globalTag = ''
        # for MC it will look like DYJetsToLL_M-100to200_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8__RunIISpring15MiniAODv2-74X_mcRun2_asymptotic_v2-v1
        # so split to just get RunIISpring15MiniAODv2-74X_mcRun2_asymptotic_v2-v1
        # The keys are shell-style patterns ('Run2018D*'), so they are matched
        # as such. Interpreted as regular expressions, 'D*' means "zero or more
        # D", which makes several keys match the same dataset and the chosen
        # tag depend on dictionary order.
        for datasetKey, tag in globalTagsByDataset.items():
            #print 'try to match:',datasetKey,'and',secondaryDatasetName
            if fnmatch.fnmatchcase(secondaryDatasetName, datasetKey):
                globalTag = tag
        if globalTag == '':
            print('ERROR: need global tag to proceed.')
            exit(-5)
        else:
            print('INFO: Overriding global tag to: %s for dataset: %s' % (globalTag, datasetName))

        # make cmssw cfg
        cmsswCfgFile = 'lqCustomNano_{0}_{1}_{2}_NANO.py'.format(('data' if isData else 'mc'), year, globalTag)
        cmsswCfgFullPath = cfgFilesDir + '/' + cmsswCfgFile
        # if we already generated the cfg, don't do it again
        if not os.path.isfile(cmsswCfgFullPath):
            nanoScriptPath = os.getenv('CMSSW_BASE') + '/src/PhysicsTools/NanoAOD/test/doCmsDriver.py'
            dataTypeArg = '--datatype=' + ('data' if isData else 'mc')
            gtArg = '--gt=' + globalTag
            yearArg = '--year=' + str(year)
            #print 'Creating CMSSW config file with cmsDriver: "{0} {1} {2} {3}"'.format(nanoScriptPath,dataTypeArg,gtArg,yearArg)
            subprocess.check_call([nanoScriptPath, dataTypeArg, gtArg, yearArg])
            # the generated cfg is picked up from the current directory and
            # moved into the cfgfiles directory
            print('rename {0} --> {1}'.format(cmsswCfgFile, cmsswCfgFullPath))
            os.rename(cmsswCfgFile, cmsswCfgFullPath)
        else:
            print('Using already-generated cfg: {0}'.format(cmsswCfgFullPath))
        with open(cmsswCfgFullPath, 'r') as config_file:
            config_txt = config_file.read()
        newCmsswConfig = cfgFilesDir + '/' + datasetName + '_cmssw_cfg.py'
        print('INFO: Creating %s ...' % newCmsswConfig)
        # substitute the output filename at the end
        if isData:
            config_txt += '\nprocess.NANOAODoutput.fileName = "' + outputFile + '.root"\n'
        else:
            config_txt += '\nprocess.NANOAODSIMoutput.fileName = "' + outputFile + '.root"\n'
        with open(newCmsswConfig, 'w') as cfgNew_file:
            cfgNew_file.write(config_txt)

        # per-dataset CRAB settings
        config.General.requestName = datasetName
        config.JobType.psetName = newCmsswConfig
        config.Data.totalUnits = nUnits
        if options.jsonFile is not None:
            if options.prevJsonFile is not None:
                print('Using the subtraction between previous json and new json; WARNING: if lumis changed from good in previous to bad in new json, this will not remove them')
                from WMCore.DataStructs.LumiList import LumiList
                # https:// locations are treated as URLs too (the usage text
                # advertises one); a plain "'http:' in ..." test misses them
                prevJsonLumiList = LumiList(url=options.prevJsonFile) if _isUrl(options.prevJsonFile) else LumiList(filename=options.prevJsonFile)
                currentJsonLumiList = LumiList(url=options.jsonFile) if _isUrl(options.jsonFile) else LumiList(filename=options.jsonFile)
                newLumiList = currentJsonLumiList - prevJsonLumiList
                newLumiList.writeJSON('newJSON_minus_oldJSON.json')
                config.Data.lumiMask = 'newJSON_minus_oldJSON.json'
            else:
                config.Data.lumiMask = options.jsonFile
        if options.runRange is not None:
            # the value lives on the parsed options; a bare 'runRange' is undefined
            config.Data.runRange = options.runRange

        # and submit
        print('submit!')
        #crabSubmit(config)
        # workaround for cmssw multiple-loading problem
        # submit in subprocess
        q = Queue()
        p = Process(target=crabSubmit, args=(config,))
        p.start()
        p.join()
        # A non-zero exit code means the child died without reporting; check it
        # first so that q.get() is never called on a queue that stays empty.
        if p.exitcode != 0 or q.get() == -1:
            exit(-1)

print('Done!')
exit(0)