Merge pull request cms-sw#110 from mmasciov/LocalDBSforPrivateMiniAOD
Local DBS for private MiniAOD
gpetruc committed Aug 27, 2014
2 parents 0a94f13 + 737220c commit ad129e9
Showing 6 changed files with 302 additions and 13 deletions.
140 changes: 136 additions & 4 deletions CMGTools/Production/python/dataset.py
@@ -18,11 +18,15 @@ def __str__(self):

class BaseDataset( object ):

### def __init__(self, name, user, pattern='.*root', run_range=None):
def __init__(self, name, user, pattern='.*root', run_range=None, dbsInstance=None):
self.name = name
self.user = user
self.pattern = pattern
self.run_range = run_range
### MM
self.dbsInstance = dbsInstance
### MM
self.primaryDatasetEntries = -1
self.report = None
self.buildListOfFiles( self.pattern )
@@ -323,8 +327,90 @@ def getPrimaryDatasetEntries(self):
return int(self.report.get('PrimaryDatasetEntries',-1))
return -1


### MM
class PrivateDataset ( BaseDataset ):

def __init__(self, name, dbsInstance=None):
super(PrivateDataset, self).__init__(name, 'PRIVATE', dbsInstance=dbsInstance)

def buildListOfFilesDBS(self, name, dbsInstance):
entries = self.findPrimaryDatasetNumFiles(name, dbsInstance, -1, -1)
files = []
dbs = 'das_client.py --query="file dataset=%s instance=prod/%s" --limit=%s' % (name, dbsInstance, entries)
dbsOut = os.popen(dbs)
for line in dbsOut:
if line.find('/store')==-1:
continue
line = line.rstrip()
# print 'line',line
files.append(line)
#return ['root://eoscms//eos/cms%s' % f for f in files]
return files

def buildListOfFiles(self, pattern='.*root'):
self.files = self.buildListOfFilesDBS(self.name, self.dbsInstance)


@staticmethod
def findPrimaryDatasetEntries(dataset, dbsInstance, runmin, runmax):

query, qwhat = dataset, "dataset"
if "#" in dataset: qwhat = "block"
if runmin >0 or runmax > 0:
if runmin == runmax:
query = "%s run=%d" % (query,runmin)
else:
print "WARNING: queries with run ranges are slow in DAS"
query = "%s run between [%d, %d]" % (query,runmin if runmin > 0 else 1, runmax if runmax > 0 else 999999)
dbs='das_client.py --query="summary %s=%s instance=prod/%s"'%(qwhat, query, dbsInstance)
dbsOut = os.popen(dbs).readlines()

entries = []
for line in dbsOut:
line = line.replace('\n','')
if "nevents" in line:
entries.append(int(line.split(":")[1]))
if entries:
return sum(entries)
return -1
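    # For illustration, with a hypothetical phys03 dataset and a run range,
    #   findPrimaryDatasetEntries('/MyPrim/myuser-MySkim-abc/USER', 'phys03', 123456, 123460)
    # shells out to
    #   das_client.py --query="summary dataset=/MyPrim/myuser-MySkim-abc/USER run between [123456, 123460] instance=prod/phys03"
    # and returns the sum of the 'nevents' fields in the output.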


@staticmethod
def findPrimaryDatasetNumFiles(dataset, dbsInstance, runmin, runmax):

query, qwhat = dataset, "dataset"
if "#" in dataset: qwhat = "block"
if runmin >0 or runmax > 0:
if runmin == runmax:
query = "%s run=%d" % (query,runmin)
else:
print "WARNING: queries with run ranges are slow in DAS"
query = "%s run between [%d, %d]" % (query,runmin if runmin > 0 else 1, runmax if runmax > 0 else 999999)
dbs='das_client.py --query="summary %s=%s instance=prod/%s"'%(qwhat, query, dbsInstance)
dbsOut = os.popen(dbs).readlines()

entries = []
for line in dbsOut:
line = line.replace('\n','')
if "nfiles" in line:
entries.append(int(line.split(":")[1]))
if entries:
return sum(entries)
return -1

def getPrimaryDatasetEntries(self):
runmin = -1
runmax = -1
if self.run_range is not None:
runmin = self.run_range[0]
runmax = self.run_range[1]
return self.findPrimaryDatasetEntries(self.name, self.dbsInstance, runmin, runmax)
### MM
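# Minimal usage sketch; the dataset name and instance below are hypothetical:
#   ds = PrivateDataset('/MyPrim/myuser-MySkim-abc/USER', dbsInstance='phys03')
#   print ds.files[:3]                  # LFNs collected via das_client.py
#   print ds.getPrimaryDatasetEntries() # summed 'nevents' from the DAS summary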


def createDataset( user, dataset, pattern, readcache=False,
basedir = None, run_range = None):

cachedir = '/'.join( [os.environ['HOME'],'.cmgdataset'])

@@ -364,7 +450,7 @@ def readCache(data, user, pattern):
info = False
elif user == 'LOCAL':
data = LocalDataset( dataset, basedir, pattern)
        info = False
else:
data = Dataset( dataset, user, pattern)
writeCache(data)
@@ -377,3 +463,49 @@ def readCache(data, user, pattern):
## else:
## data = Dataset( user, dataset, pattern )
return data

### MM
def createMyDataset( user, dataset, pattern, dbsInstance, readcache=False):

cachedir = '/'.join( [os.environ['HOME'],'.cmgdataset'])

def cacheFileName(data, user, dbsInstance, pattern):
cf = data.replace('/','_')
name = '{dir}/{user}%{dbsInstance}%{name}%{pattern}.pck'.format(
dir = cachedir,
user = user,
dbsInstance = dbsInstance,
name = cf,
pattern = pattern)
return name

def writeCache(dataset):
if not os.path.exists(cachedir):
os.mkdir(cachedir)
cachename = cacheFileName(dataset.name,
dataset.user,
dataset.dbsInstance,
dataset.pattern)
pckfile = open( cachename, 'w')
pickle.dump(dataset, pckfile)

def readCache(data, user, dbsInstance, pattern):
cachename = cacheFileName(data, user, dbsInstance, pattern)

pckfile = open( cachename)
dataset = pickle.load(pckfile)
#print 'reading cache'
return dataset

if readcache:
try:
data = readCache(dataset, user, dbsInstance, pattern)
except IOError:
readcache = False
if not readcache:
if user == 'PRIVATE':
data = PrivateDataset( dataset, dbsInstance )
info = False
writeCache(data)
return data
### MM
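A minimal sketch of the intended cache round-trip; the dataset name and instance are hypothetical:

data = createMyDataset('PRIVATE', '/MyPrim/myuser-MySkim-abc/USER',
                       '.*root', 'phys03', readcache=False)  # queries DAS, writes the ~/.cmgdataset pickle
data = createMyDataset('PRIVATE', '/MyPrim/myuser-MySkim-abc/USER',
                       '.*root', 'phys03', readcache=True)   # reads the dataset back from the pickle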
21 changes: 20 additions & 1 deletion CMGTools/Production/python/datasetToSource.py
@@ -1,5 +1,5 @@
import os
from CMGTools.Production.dataset import Dataset, CMSDataset, LocalDataset, createDataset, PrivateDataset, createMyDataset

import FWCore.ParameterSet.Config as cms

@@ -18,3 +18,22 @@ def datasetToSource( user, dataset, pattern='.*root', readCache=False):
source.fileNames.extend( data.listOfGoodFiles() )

return source

### MM
def myDatasetToSource( user, dataset, pattern='.*root', dbsInstance=None, readCache=False):

#print user, dataset, pattern, dbsInstance
data = createMyDataset(user, dataset, pattern, dbsInstance, readCache)

source = cms.Source(
"PoolSource",
noEventSort = cms.untracked.bool(True),
duplicateCheckMode = cms.untracked.string("noDuplicateCheck"),
fileNames = cms.untracked.vstring()
)

#print data.listOfGoodFiles()
source.fileNames.extend( data.listOfGoodFiles() )

return source
### MM
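For example, a cfg fragment could build its source from a private production like this (dataset name hypothetical; process is the usual cms.Process object):

from CMGTools.Production.datasetToSource import myDatasetToSource

process.source = myDatasetToSource('PRIVATE',
                                   '/MyPrim/myuser-MySkim-abc/USER',
                                   '.*root', dbsInstance='phys03')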
7 changes: 7 additions & 0 deletions CMGTools/Production/python/getMyFiles.py
@@ -0,0 +1,7 @@
def getMyFiles(dataset, user, pattern, dbsInstance):
    from CMGTools.Production.datasetToSource import myDatasetToSource
    # print 'getting files for', dataset,user,pattern
    ds = myDatasetToSource( user, dataset, pattern, dbsInstance, True )
files = ds.fileNames
return ['root://eoscms//eos/cms%s' % f for f in files]
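A typical call, with a hypothetical dataset; the LFNs from DAS come back rewritten as EOS xrootd URLs:

files = getMyFiles('/MyPrim/myuser-MySkim-abc/USER', 'PRIVATE', '.*root', 'phys03')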

93 changes: 86 additions & 7 deletions CMGTools/TTHAnalysis/python/samples/ComponentCreator.py
@@ -1,5 +1,5 @@
import CMGTools.RootTools.fwlite.Config as cfg
from CMGTools.Production.datasetToSource import datasetToSource, myDatasetToSource
from CMGTools.Production.datasetInformation import DatasetInformation

class ComponentCreator(object):
@@ -37,6 +37,37 @@ def makePrivateMCComponent(self,name,dataset,files):

# print 'Skim Efficiency for ',name,'=', component.skimEfficiency
return component

### MM
def makeMyPrivateMCComponent(self,name,dataset,user,pattern,dbsInstance):

#entries = findMyPrimaryDatasetNumFiles(dataset, -1, -1)
#dbs = 'das_client.py --query="file dataset=%s instance=prod/phys03"' % dataset
#dbsOut = os.popen(dbs)
#files = []
#filesDBS = []
#for line in dbsOut:
# if line.find('/store')==-1:
# continue
# line = line.rstrip()
# # print 'line',line
# filesDBS.append(line)

component = cfg.MCComponent(
dataset=dataset,
name = name,
##files = ['root://eoscms//eos/cms%s' % f for f in filesDBS],
##files = self.getListOfFilesDBS(dataset, dbsInstance),
#files = getListOfFilesDBS(dataset, dbsInstance),
files = self.getMyFiles(dataset, user, pattern, dbsInstance),
xSection = 1,
nGenEvents = 1,
triggers = [],
effCorrFactor = 1,
)

return component
### MM
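    # Usage sketch, with hypothetical names:
    #   kreator = ComponentCreator()
    #   MyMC = kreator.makeMyPrivateMCComponent("MyMC",
    #       "/MyPrim/myuser-MySkim-abc/USER", "PRIVATE", ".*root", "phys03")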

def makeDataComponent(self,name,datasets,user,pattern):
files=[]
@@ -56,16 +87,64 @@ def makeDataComponent(self,name,datasets,user,pattern):
return component


    def getFiles(self, dataset, user, pattern):
# print 'getting files for', dataset,user,pattern
ds = datasetToSource( user, dataset, pattern, True )
files = ds.fileNames
return ['root://eoscms//eos/cms%s' % f for f in files]

### MM
def getMyFiles(self, dataset, user, pattern, dbsInstance):
# print 'getting files for', dataset,user,pattern
ds = myDatasetToSource( user, dataset, pattern, dbsInstance, True )
files = ds.fileNames
return ['root://eoscms//eos/cms%s' % f for f in files]
### MM

    def getSkimEfficiency(self,dataset,user):
        info=DatasetInformation(dataset,user,'',False,False,'','','')
        fraction=info.dataset_details['PrimaryDatasetFraction']
        if fraction<0.001:
            print 'ERROR FRACTION IS ONLY ',fraction
        return fraction


#### MM
# ### Function to get files on DBS prod/phys03
# def getListOfFilesDBS(self, dataset, dbsInstance):
# entries = findMyPrimaryDatasetNumFiles(dataset, dbsInstance, -1, -1)
# #print entries
# filesDBS = []
# dbs = 'das_client.py --query="file dataset=%s instance=prod/%s" --limit=%s' % (dataset, dbsInstance, entries)
# dbsOut = os.popen(dbs)
# for line in dbsOut:
# if line.find('/store')==-1:
# continue
# line = line.rstrip()
# # print 'line',line
# filesDBS.append(line)
# return ['root://eoscms//eos/cms%s' % f for f in filesDBS]
#
#
#def findMyPrimaryDatasetNumFiles(dataset, dbsInstance, runmin, runmax):
#
# query, qwhat = dataset, "dataset"
# if "#" in dataset: qwhat = "block"
# if runmin >0 or runmax > 0:
# if runmin == runmax:
# query = "%s run=%d" % (query,runmin)
# else:
# print "WARNING: queries with run ranges are slow in DAS"
# query = "%s run between [%d, %d]" % (query,runmin if runmin > 0 else 1, runmax if runmax > 0 else 999999)
# dbs='das_client.py --query="summary %s=%s instance=prod/%s"'%(qwhat, query, dbsInstance)
# dbsOut = os.popen(dbs).readlines()
# entries = []
# for line in dbsOut:
# line = line.replace('\n','')
# if "nfiles" in line:
# entries.append(int(line.split(":")[1]))
# if entries:
# return sum(entries)
# return -1
#### MM

7 changes: 7 additions & 0 deletions CMGTools/TTHAnalysis/python/samples/getMyFiles.py
@@ -0,0 +1,7 @@
def getMyFiles(dataset, user, pattern, dbsInstance):
    from CMGTools.Production.datasetToSource import myDatasetToSource
    # print 'getting files for', dataset,user,pattern
    ds = myDatasetToSource( user, dataset, pattern, dbsInstance, True )
files = ds.fileNames
return ['root://eoscms//eos/cms%s' % f for f in files]
