#!/usr/bin/python
# Usage : python dq2bulkreg.py [task id]
# Query the production database for datasets and lfns.
# Query the RLS for guids.
# Register the lfns and guids to datasets
#
# When modifying the code, be CAREFUL. The splitting
# of registration in subblocks makes it a bit intricate.
import getpass
import commands
import sys,re
import cx_Oracle
import math
from xml.dom import minidom
def printErrorMsg(msg,qry):
    """Print a failure report and terminate the script.

    msg -- first item printed; callers in this file pass the command
           line that failed.
    qry -- second item printed; callers in this file pass the output
           captured from that command.

    Never returns: always ends the process via sys.exit(-1).
    """
    # Same five output lines as before: the two arguments, each followed
    # by an empty line, then the fixed hint for the operator.
    report = (msg, '', qry, '', '--- Failed. Clean up and try again.')
    for line in report:
        print(line)
    sys.exit(-1)
def _parseJobXml(text):
    """Parse an XML fragment, retrying inside a synthetic root element.

    Some job records hold XML that is not a well-formed document.  A
    common cause of that is more than one top-level element, so the
    second attempt wraps the text in a single enclosing element.  The
    callers only use getElementsByTagName, which searches all
    descendants, so the extra root does not change what they find.
    If the second attempt fails as well, its exception propagates.
    """
    try:
        return minidom.parseString(text)
    except Exception:
        # NOTE(review): the previous fallback re-parsed the unchanged
        # string, which could never succeed.  The wrapper element name is
        # arbitrary -- confirm it matches the faults seen in production.
        return minidom.parseString('<jobxml>' + text + '</jobxml>')


def getLFNandDS(dbuser,dbpasswd,dbname,taskfk):
    """Collect [lfn, dataset] pairs for all DONE jobs of one task.

    dbuser, dbpasswd, dbname -- credentials and name used to build the
                                cx_Oracle connect string.
    taskfk                   -- task id (string) selecting rows of ejobdef.

    Returns a list of two-element lists [lfn, dataset], one for every
    <joboutput> element of the joboutputs column and every <joblogs>
    element of the joblogs column, for rows whose currentstatus is 'DONE'.
    Rows are visited in jobname order.
    """
    print('... Getting lfns and dataset names from production database.')
    # The task id is passed as a bind variable instead of being pasted
    # into the statement, so a malformed argument cannot alter the query.
    sql1 = ('select jobname, controlstatus, currentstatus, joboutputs,'
            'joblogs from ejobdef where taskfk=:taskfk order by jobname')
    print('... %s [taskfk=%s]' % (sql1, taskfk))
    files = []
    connection = cx_Oracle.connect("%s/%s@%s" % (dbuser, dbpasswd, dbname))
    try:
        cur1 = connection.cursor()
        try:
            cur1.execute(sql1, {'taskfk': taskfk})
            # Rows are processed while the connection is still open.
            for entry in cur1.fetchall():
                if entry[2] != 'DONE':
                    continue
                outputs = _parseJobXml(entry[3]).getElementsByTagName("joboutput")
                logs = _parseJobXml(entry[4]).getElementsByTagName("joblogs")
                for node in list(outputs) + list(logs):
                    lfn = node.getElementsByTagName("lfn")[0].firstChild.data
                    # To register only one dataset, filter on lfn here,
                    # e.g.: if not re.search(r'log', lfn): continue
                    ds = node.getElementsByTagName("dataset")[0].firstChild.data
                    files.append([lfn, ds])
        finally:
            cur1.close()
    finally:
        # The connection was previously left open; release it even when
        # the query or the XML handling raises.
        connection.close()
    return files
def getGUIDs(RLSurl,files):
    """Look up the dq_guid attribute of every lfn in files via the RLS.

    RLSurl -- URL of the RLS server, passed as the last argument of
              globus-rls-cli.
    files  -- list of [lfn, dataset] entries; only the lfn is used.

    Returns the guids as a list.  The command output is split on
    whitespace and every fourth token, starting with the fourth, is
    taken as a guid.  Globus returns the last guid first, so the list
    is reversed to line up with the order of files.

    Exits with status -1 if the command fails or if it yields fewer
    guids than there are files.
    """
    print('... Getting guids from RLS.')
    lfns = [entry[0] for entry in files]
    cmd = ('globus-rls-cli attribute bulk query dq_guid lfn '
           + ' '.join(lfns + [RLSurl]))
    status, output = commands.getstatusoutput(cmd)
    if status != 0:
        print(cmd)
        print('')
        print('--- globus-rls-cli dq_guid query failed.')
        print('--- No registration done.')
        print(output)
        sys.exit(-1)
    dq_guids = output.split()[3::4]
    dq_guids.reverse()
    if len(dq_guids) < len(files):
        # The caller indexes the guids by file position; too few of them
        # would fail with an IndexError in the middle of registering this
        # block.  Stop before anything in the block has been registered.
        # NOTE(review): a surplus of guids is still accepted as before,
        # although the positional pairing may then be off -- confirm.
        print(cmd)
        print('')
        print('--- globus-rls-cli returned %d guids for %d lfns.'
              % (len(dq_guids), len(files)))
        print('--- No registration done for this block.')
        print(output)
        sys.exit(-1)
    return dq_guids
def register(files,dq_guids,DsLoc,dataset):
    """Register files with dq2, creating each dataset when it first appears.

    files    -- list of [lfn, dataset] entries.  Consecutive entries that
                share a dataset are added with one registerFilesInDataset
                command.
    dq_guids -- guids, indexed by the same position as files.
    DsLoc    -- location given to 'dq2 registerDatasetLocation -i'.
    dataset  -- dataset name to start from.  While entries keep this
                name, no registerNewDataset is issued for it, which lets
                a dataset continue across successive calls.

    Returns the dataset name in effect after the last entry, to be handed
    to the next call.  Any dq2 command with a non-zero exit status is
    reported through printErrorMsg.
    """
    def run_or_abort(command):
        # Run one shell command; hand command and output to the error
        # reporter when the exit status is non-zero.
        status, output = commands.getstatusoutput(command)
        if status != 0:
            printErrorMsg(command, output)

    print('... Registering new datasets and adding files.')
    add_files_cmd = 'dq2 registerFilesInDataset ' + dataset
    for position, entry in enumerate(files):
        owner = entry[1]
        if owner != dataset:
            # Dataset boundary: flush the files gathered so far.  Nothing
            # has been gathered yet when the very first entry differs.
            if position != 0:
                run_or_abort(add_files_cmd)
            dataset = owner
            run_or_abort('dq2 registerNewDataset ' + dataset)
            run_or_abort('dq2 registerDatasetLocation -i ' + dataset + ' ' + DsLoc)
            add_files_cmd = 'dq2 registerFilesInDataset ' + dataset
        add_files_cmd += ' ' + entry[0] + ' ' + dq_guids[position]
    # Flush the files of the last (possibly only) dataset.
    run_or_abort(add_files_cmd)
    return dataset
# MAIN PROGRAM
# Takes exactly one command-line argument, the task id.  Reads the task's
# files from the production database, then looks up guids and registers
# the files in blocks of at most lblock entries.
if len(sys.argv[1:]) != 1:
    print('Usage : python dq2bulkreg.py [TASKID]')
    sys.exit(-1)

RLSurl = 'rls://atlasrls.nordugrid.org:39281'   # RLS Server
DSLoc = 'NDGFDISK'                              # Dataset Location
dbname = 'atlas_prodsys'
dbuser = 'atlas_ps_adhoc'
lblock = 500                                    # Max block size

dbpasswd = getpass.getpass('Database password:')            # Get the database password
files = getLFNandDS(dbuser,dbpasswd,dbname,sys.argv[1])     # Query prodsys database
files.sort()

# Walk the sorted list in steps of lblock.  The block count used to come
# from math.ceil, which returns a float on Python 2 and is rejected by
# range() on Python 2.7.  Slicing already stops at the end of the list,
# so the last, shorter block needs no special handling.
dataset = ''
for start in range(0, len(files), lblock):
    block = files[start:start + lblock]
    dq_guids = getGUIDs(RLSurl, block)                      # Query RLS
    # register returns the dataset it ended on; passing it back in lets
    # a dataset that spans two blocks be continued, not created twice.
    dataset = register(block, dq_guids, DSLoc, dataset)