#!/usr/bin/python
# Usage : python dq2bulkreg.py [task id]
# Query the production database for datasets and lfns.
# Query the RLS for guids.
# Register the lfns and guids to datasets
#
# When modifying the code, be CAREFUL. The splitting
# of registration in subblocks makes it a bit intricate:
# a dataset may span two consecutive blocks, which is why
# register() receives and returns the "current" dataset name.
#
# All print calls use a single argument so that the script behaves the
# same under Python 2 (2.6+) and Python 3.

import getpass
import math
import re
import subprocess
import sys
from xml.dom import minidom
from xml.parsers.expat import ExpatError

try:
    import cx_Oracle
except ImportError:
    # Tolerated here so the helpers below stay importable without the
    # Oracle client; getLFNandDS() reports the problem when it is needed.
    cx_Oracle = None


RLSurl = 'rls://atlasrls.nordugrid.org:39281'  # RLS server
DSLoc = 'NDGFDISK'                             # Dataset location
dbname = 'atlas_prodsys'
dbuser = 'atlas_ps_adhoc'
lblock = 500                                   # Max block size


def printErrorMsg(msg, qry):
    """Print a failed command and its output, then exit with status -1.

    msg -- the command line that failed
    qry -- the output captured from that command
    """
    print(msg)
    print('')
    print(qry)
    print('')
    print('--- Failed. Clean up and try again.')
    sys.exit(-1)


def runCommand(args):
    """Run an external command without a shell.

    args -- the command and its arguments as a list of strings

    Returns (status, output): status is 0 on success, output is the
    combined stdout/stderr text with one trailing newline removed.
    A command that cannot be started is reported as status 127 instead
    of raising, so callers handle it like any other failure.
    """
    try:
        proc = subprocess.Popen(args,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
    except OSError as err:
        return 127, str(err)
    output = proc.communicate()[0]
    if output.endswith('\n'):
        output = output[:-1]
    return proc.returncode, output


def runOrFail(args):
    """Run a command and abort through printErrorMsg() if it fails."""
    status, output = runCommand(args)
    if status != 0:
        printErrorMsg(' '.join(args), output)


def parseJobXml(text):
    """Parse a job XML string, tolerating fragments without a single root.

    Some jobs have had faulty xml. The first attempt parses the text as
    is; if that fails the text is wrapped in a synthetic <root> element
    and parsed again (the element lookups used by this script are
    unaffected by the extra level). A failure of the second attempt
    propagates to the caller.
    """
    try:
        return minidom.parseString(text)
    except ExpatError:
        return minidom.parseString('<root>' + text + '</root>')


def extractFiles(outputXml, logsXml):
    """Return [lfn, dataset] pairs found in the parsed output and log XML.

    Every <joboutput> element of outputXml and every <joblogs> element
    of logsXml contributes the text of its first <lfn> and <dataset>
    children; outputs come before logs.
    """
    nodes = list(outputXml.getElementsByTagName("joboutput"))
    nodes += logsXml.getElementsByTagName("joblogs")
    pairs = []
    for node in nodes:
        lfn = node.getElementsByTagName("lfn")[0].firstChild.data
        # Use next line to register only one dataset
        # if not re.search(r'log',lfn) : continue
        ds = node.getElementsByTagName("dataset")[0].firstChild.data
        pairs.append([lfn, ds])
    return pairs


def getLFNandDS(dbuser, dbpasswd, dbname, taskfk):
    """Fetch [lfn, dataset] pairs of all DONE jobs of a task.

    dbuser, dbpasswd, dbname -- Oracle credentials and connect name
    taskfk -- task id used to select rows from ejobdef

    Returns a list of [lfn, dataset] lists in jobname order.
    Raises ImportError when cx_Oracle is not installed.
    """
    print('... Getting lfns and dataset names from production database.')
    if cx_Oracle is None:
        raise ImportError(
            'cx_Oracle is required to query the production database')

    # The task id is passed as a bind variable, never pasted into the SQL.
    sql1 = 'select jobname, controlstatus, currentstatus, joboutputs,'
    sql1 += 'joblogs from ejobdef where taskfk=:taskfk order by jobname'
    print('... ' + sql1 + ' (taskfk=' + str(taskfk) + ')')

    # Separate arguments keep passwords containing '/' or '@' working.
    connection = cx_Oracle.connect(dbuser, dbpasswd, dbname)
    try:
        cur1 = connection.cursor()
        try:
            cur1.execute(sql1, {'taskfk': taskfk})
            entries = cur1.fetchall()
        finally:
            cur1.close()
    finally:
        connection.close()

    files = []
    for entry in entries:
        if entry[2] != 'DONE':
            continue
        files.extend(extractFiles(parseJobXml(entry[3]),
                                  parseJobXml(entry[4])))
    return files


def parseGuids(output):
    """Extract the guids from globus-rls-cli bulk query output.

    The output is split on whitespace and every fourth token, starting
    with the fourth, is taken as a guid. Be aware that globus returns
    the last guid first, so the list is reversed to match lfn order.
    """
    guids = output.split()[3::4]
    guids.reverse()
    return guids


def getGUIDs(RLSurl, files):
    """Query the RLS for the dq_guid of every lfn in files.

    RLSurl -- URL of the RLS server
    files  -- sequence of [lfn, dataset] entries

    Returns the guids in the same order as files; an empty files
    sequence yields [] without contacting the server. Exits with
    status -1 when the query fails or returns too few guids.
    """
    print('... Getting guids from RLS.')
    if not files:
        return []

    args = ['globus-rls-cli', 'attribute', 'bulk', 'query', 'dq_guid', 'lfn']
    args.extend(entry[0] for entry in files)
    args.append(RLSurl)

    status, output = runCommand(args)
    if status != 0:
        print(' '.join(args))
        print('')
        print('--- globus-rls-cli dq_guid query failed.')
        print('--- No registration done.')
        print(output)
        sys.exit(-1)

    dq_guids = parseGuids(output)
    if len(dq_guids) < len(files):
        # Registering with a short guid list would fail half way through.
        print('--- globus-rls-cli returned %d guids for %d lfns.'
              % (len(dq_guids), len(files)))
        print('--- No registration done.')
        sys.exit(-1)
    return dq_guids


def register(files, dq_guids, DsLoc, dataset):
    """Register one block of files, creating datasets as they appear.

    files    -- sequence of [lfn, dataset] entries, grouped by dataset
    dq_guids -- guids, dq_guids[i] belonging to files[i]
    DsLoc    -- location registered for every newly created dataset
    dataset  -- dataset the previous block ended with ('' for none);
                files of that dataset are added without re-creating it

    Returns the dataset of the last file so it can be passed to the
    next call. Raises ValueError if there are fewer guids than files;
    any failing dq2 command ends the program through printErrorMsg().
    """
    print('... Registering new datasets and adding files.')
    if not files:
        return dataset
    if len(dq_guids) < len(files):
        raise ValueError('got %d guids for %d files'
                         % (len(dq_guids), len(files)))

    pending = []  # lfn/guid arguments collected for the current dataset
    for i, entry in enumerate(files):
        if dataset != entry[1]:
            if pending:
                # Flush the files of the dataset that just ended.
                runOrFail(['dq2', 'registerFilesInDataset', dataset] + pending)
            dataset = entry[1]
            runOrFail(['dq2', 'registerNewDataset', dataset])
            runOrFail(['dq2', 'registerDatasetLocation', '-i', dataset, DsLoc])
            pending = []
        pending.extend([entry[0], dq_guids[i]])

    runOrFail(['dq2', 'registerFilesInDataset', dataset] + pending)
    return dataset


def splitBlocks(files, lblock):
    """Split files into consecutive blocks of at most lblock entries."""
    if lblock < 1:
        raise ValueError('block size must be at least 1')
    return [files[start:start + lblock]
            for start in range(0, len(files), lblock)]


def main(argv):
    """Register all files of the task whose id is the single argument."""
    if len(argv) != 1:
        print('Usage : python dq2bulkreg.py [TASKID]')
        sys.exit(-1)

    dbpasswd = getpass.getpass('Database password:')   # Get the database password
    files = getLFNandDS(dbuser, dbpasswd, dbname, argv[0])  # Query prodsys database
    files.sort()

    dataset = ''
    for block in splitBlocks(files, lblock):
        dq_guids = getGUIDs(RLSurl, block)             # Query RLS
        dataset = register(block, dq_guids, DSLoc, dataset)


# MAIN PROGRAM
if __name__ == '__main__':
    main(sys.argv[1:])