diff --git a/.secrets.baseline b/.secrets.baseline index 885863c08..8a0c98e39 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": "poetry.lock", "lines": null }, - "generated_at": "2021-04-16T20:42:51Z", + "generated_at": "2021-04-28T19:37:37Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -176,37 +176,37 @@ { "hashed_secret": "96c9184fb19c9c1618ccf44d141f8029a739891c", "is_verified": false, - "line_number": 115, + "line_number": 121, "type": "Hex High Entropy String" }, { "hashed_secret": "e1da93616713812cb50e0ac845b1e9e305d949f1", "is_verified": false, - "line_number": 311, + "line_number": 317, "type": "Hex High Entropy String" }, { "hashed_secret": "47f42f4c34fddab383b817e689dc0fb75af81266", "is_verified": false, - "line_number": 335, + "line_number": 341, "type": "Hex High Entropy String" }, { "hashed_secret": "300d95dd5d30ab6928ffda6c08c6a129a23e5b39", "is_verified": false, - "line_number": 359, + "line_number": 365, "type": "Hex High Entropy String" }, { "hashed_secret": "f9e664db75c7f23a299b0b055c10e08d47073e93", "is_verified": false, - "line_number": 421, + "line_number": 427, "type": "Hex High Entropy String" }, { "hashed_secret": "7c35c215b326b9463b669b657c1ff9873ff53d9a", "is_verified": false, - "line_number": 446, + "line_number": 452, "type": "Hex High Entropy String" } ] diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle index 86d298882..9091fac8d 100644 Binary files a/docs/_build/doctrees/environment.pickle and b/docs/_build/doctrees/environment.pickle differ diff --git a/docs/_build/doctrees/tools/indexing.doctree b/docs/_build/doctrees/tools/indexing.doctree index 83439ab44..ed1398bc3 100644 Binary files a/docs/_build/doctrees/tools/indexing.doctree and b/docs/_build/doctrees/tools/indexing.doctree differ diff --git a/docs/_build/doctrees/tools/metadata.doctree b/docs/_build/doctrees/tools/metadata.doctree index 551d98720..a79943d0e 100644 Binary files a/docs/_build/doctrees/tools/metadata.doctree and b/docs/_build/doctrees/tools/metadata.doctree differ diff --git a/docs/_build/html/searchindex.js b/docs/_build/html/searchindex.js index b824d225f..1562db9e5 100644 --- a/docs/_build/html/searchindex.js +++ b/docs/_build/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({docnames:["auth","file","index","indexing","jobs","metadata","query","submission","tools","tools/indexing","tools/metadata","wss"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":3,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":2,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.viewcode":1,sphinx:56},filenames:["auth.rst","file.rst","index.rst","indexing.rst","jobs.rst","metadata.rst","query.rst","submission.rst","tools.rst","tools/indexing.rst","tools/metadata.rst","wss.rst"],objects:{"gen3.auth":{Gen3Auth:[0,0,1,""]},"gen3.auth.Gen3Auth":{curl:[0,1,1,""],get_access_token:[0,1,1,""],refresh_access_token:[0,1,1,""]},"gen3.file":{Gen3File:[1,0,1,""]},"gen3.file.Gen3File":{get_presigned_url:[1,1,1,""]},"gen3.index":{Gen3Index:[3,0,1,""]},"gen3.index.Gen3Index":{async_create_record:[3,1,1,""],async_get_record:[3,1,1,""],async_get_records_on_page:[3,1,1,""],async_get_with_params:[3,1,1,""],async_query_urls:[3,1,1,""],async_update_record:[3,1,1,""],create_blank:[3,1,1,""],create_new_version:[3,1,1,""],create_record:[3,1,1,""],delete_record:[3,1,1,""],get:[3,1,1,""],get_all_records:[3,1,1,""],get_latest_version:[3,1,1,""],get_record:[3,1,1,""],get_record_doc:[3,1,1,""],get_records:[3,1,1,""],get_records_on_page:[3,1,1,""],get_stats:[3,1,1,""],get_urls:[3,1,1,""],get_version:[3,1,1,""],get_versions:[3,1,1,""],get_with_params:[3,1,1,""],is_healthy:[3,1,1,""],query_urls:[3,1,1,""],update_blank:[3,1,1,""],update_record:[3,1,1,""]},"gen3.jobs":{Gen3Jobs:[4,0,1,""]},"gen3.jobs.Gen3Jobs":{async_run_job_and_wait:[4,1,1,""],create_job:[4,1,1,""],get_output:[4,1,1,""],get_status:[4,1,1,""],get_version:[4,1,1,""],is_healthy:[4,1,1,""],list_jobs:[4,1,1,""]},"gen3.metadata":{Gen3Metadata:[5,0,1,""]},"gen3.metadata.Gen3Metadata":{"delete":[5,1,1,""],async_create:[5,1,1,""],async_get:[5,1,1,""],async_update:[5,1,1,""],auth_provider:[5,2,1,""],batch_create:[5,1,1,""],create:[5,1,1,""],create_index_key_path:[5,1,1,""],delete_index_key_path:[5,1,1,""],endpoint:[5,2,1,""],get:[5,1,1,""],get_index_key_paths:[5,1,1,""],get_version:[5,1,1,""],is_healthy:[5,1,1,""],query:[5,1,1,""],update:[5,1,1,""]},"gen3.query":{Gen3Query:[6,0,1,""]},"gen3.query.Gen3Query":{graphql_query:[6,1,1,""],query:[6,1,1,""],raw_data_download:[6,1,1,""]},"gen3.submission":{Gen3Submission:[7,0,1,""]},"gen3.submission.Gen3Submission":{create_program:[7,1,1,""],create_project:[7,1,1,""],delete_node:[7,1,1,""],delete_nodes:[7,1,1,""],delete_program:[7,1,1,""],delete_project:[7,1,1,""],delete_record:[7,1,1,""],delete_records:[7,1,1,""],export_node:[7,1,1,""],export_record:[7,1,1,""],get_dictionary_all:[7,1,1,""],get_dictionary_node:[7,1,1,""],get_graphql_schema:[7,1,1,""],get_programs:[7,1,1,""],get_project_dictionary:[7,1,1,""],get_project_manifest:[7,1,1,""],get_projects:[7,1,1,""],open_project:[7,1,1,""],query:[7,1,1,""],submit_file:[7,1,1,""],submit_record:[7,1,1,""]},"gen3.tools.indexing":{download_manifest:[9,3,0,"-"],index_manifest:[9,3,0,"-"],verify_manifest:[9,3,0,"-"]},"gen3.tools.indexing.download_manifest":{CURRENT_DIR:[9,2,1,""],INDEXD_RECORD_PAGE_SIZE:[9,2,1,""],MAX_CONCURRENT_REQUESTS:[9,2,1,""],TMP_FOLDER:[9,2,1,""],async_download_object_manifest:[9,4,1,""]},"gen3.tools.indexing.index_manifest":{ACLS:[9,2,1,""],AUTHZ:[9,2,1,""],CURRENT_DIR:[9,2,1,""],GUID:[9,2,1,""],MD5:[9,2,1,""],PREV_GUID:[9,2,1,""],SIZE:[9,2,1,""],ThreadControl:[9,0,1,""],URLS:[9,2,1,""],get_and_verify_fileinfos_from_manifest:[9,4,1,""],get_and_verify_fileinfos_from_tsv_manifest:[9,4,1,""],index_object_manifest:[9,4,1,""]},"gen3.tools.indexing.verify_manifest":{CURRENT_DIR:[9,2,1,""],MAX_CONCURRENT_REQUESTS:[9,2,1,""],async_verify_object_manifest:[9,4,1,""]},"gen3.tools.metadata":{ingest_manifest:[10,3,0,"-"]},"gen3.tools.metadata.ingest_manifest":{COLUMN_TO_USE_AS_GUID:[10,2,1,""],GUID_TYPE_FOR_INDEXED_FILE_OBJECT:[10,2,1,""],GUID_TYPE_FOR_NON_INDEXED_FILE_OBJECT:[10,2,1,""],MAX_CONCURRENT_REQUESTS:[10,2,1,""],async_ingest_metadata_manifest:[10,4,1,""],async_query_urls_from_indexd:[10,4,1,""]},"gen3.wss":{Gen3WsStorage:[11,0,1,""]},"gen3.wss.Gen3WsStorage":{copy:[11,1,1,""],download:[11,1,1,""],download_url:[11,1,1,""],ls:[11,1,1,""],ls_path:[11,1,1,""],rm:[11,1,1,""],rm_path:[11,1,1,""],upload:[11,1,1,""],upload_url:[11,1,1,""]},gen3:{tools:[8,3,0,"-"]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","attribute","Python attribute"],"3":["py","module","Python module"],"4":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:attribute","3":"py:module","4":"py:function"},terms:{"0420947":[],"0938203":[],"0939903":[],"0a80fada010c":9,"0a80fada096c":9,"0a80fada097c":9,"0a80fada098c":9,"0a80fada099c":9,"100":[7,9],"11e9":9,"1616009968":[],"1616010780":[],"1616010781":[],"1616018689":[],"1617985382":[],"1617997157":[],"1618604466":[],"1618606085":[],"1619452575":9,"1619452576":10,"2280114":[],"255e396f":9,"2889535":[],"333":5,"343434344":9,"363455714":9,"3910377":[],"3926728":10,"450c":9,"4714":7,"473d83400bc1bc9dc635e334fadd433c":9,"473d83400bc1bc9dc635e334faddd33c":9,"473d83400bc1bc9dc635e334fadde33c":9,"473d83400bc1bc9dc635e334faddf33c":9,"4904246":[],"543434443":9,"5495481":[],"5964222":[],"641011":[],"6572418":[],"6f90":7,"7461076":[],"7649791":[],"7d3d8d2083b4":9,"8420":7,"9159722":[],"93d9af72":9,"9644923":9,"9a07":9,"boolean":3,"class":[0,2,9,11],"default":[0,6,7,9,10],"export":7,"function":[2,3,4,5,8,9,10],"import":9,"int":[3,5,6,7,9,10],"new":[0,3],"public":[3,5],"return":[0,3,4,5,6,7,9],"true":[3,4,5,6,7,9,10],"while":[0,1,3,4,5,6,7,11],But:5,For:[1,5,6,7,8,9],NOT:10,One:6,Such:8,That:3,The:[0,1,2,3,5,7,9],There:9,These:8,Used:9,Will:[4,6],_get_acl_from_row:9,_get_authz_from_row:9,_get_file_name_from_row:9,_get_file_size_from_row:9,_get_guid_for_row:10,_get_guid_from_row:9,_get_md5_from_row:9,_get_urls_from_row:9,_guid_typ:10,_query_for_associated_indexd_record_guid:10,_ssl:[3,4,5],a5c6:9,ab167e49d25b488939b1ede42752458b:3,about:[2,3],abov:9,access:[0,1,3,6],accesstoken:0,acl:[3,9],across:9,action:[8,9],actual:9,add:[3,5],added:3,addit:3,admin:[5,9],admin_endpoint_suffix:5,against:[3,6,7,9,10],algorithm:3,alia:3,aliv:6,all:[3,4,5,6,7,9,10],allow:[7,9,10],along:2,alreadi:8,altern:9,alwai:5,ammount:10,amount:8,ani:[5,9],anoth:5,api:[5,7,9],api_kei:9,appli:6,appropri:11,arbitrari:0,argument:[0,11],arrai:7,asc:6,assign:8,associ:[3,5],assum:9,async:[4,8,9,10],async_cr:5,async_create_record:3,async_download_object_manifest:9,async_get:5,async_get_record:3,async_get_records_on_pag:3,async_get_with_param:3,async_ingest_metadata_manifest:10,async_query_url:3,async_query_urls_from_indexd:10,async_run_job_and_wait:4,async_upd:5,async_update_record:3,async_verify_object_manifest:9,asynchron:[3,4,5],asyncio:[9,10],attach:[3,5],attribut:9,auth:[1,2,3,4,5,6,7,9,10,11],auth_provid:[1,3,4,5,6,7,11],authbas:0,authent:0,authz:[0,3,9],auto:[0,2],automat:0,avail:[1,2],b0f1:9,base:[0,1,3,4,5,6,7,8,9,11],baseid:3,basic:[3,9,10],batch_creat:5,batch_siz:7,behavior:9,belong:7,below:9,blank:3,blob:[5,6],bodi:3,bool:[4,5,7,9,10],broad:8,broken:8,call:11,can:[0,3,4,7,9,10],capabl:8,categori:8,ccle:7,ccle_one_record:7,ccle_sample_nod:7,cdi:6,chang:[3,9],chunk_siz:7,client:3,code:[2,7],column:[9,10],column_to_use_as_guid:10,com:6,comma:9,common:[0,1,3,4,5,6,7,8,9,10,11],commons_url:[9,10],complet:4,complex:6,concat:9,concurr:[9,10],connect:10,consist:3,constructor:0,contain:[0,2,5,7,8,9,10],content:[3,11],control:3,copi:11,coroutin:9,correspond:3,crdc:0,creat:[3,4,5,7,9],create_blank:3,create_index_key_path:5,create_job:4,create_new_vers:3,create_program:7,create_project:7,create_record:3,cred:3,credenti:[0,1,3,4,5,6,7,9,11],csv:[7,9,10],curl:0,current:7,current_dir:9,custom:9,d70b41b9:7,data:[0,3,5,6,7],data_spreadsheet:7,data_typ:6,databas:5,dbgap:10,dcf:7,def:9,defin:[5,7],delai:4,delet:[0,3,5,7],delete_index_key_path:5,delete_nod:7,delete_program:7,delete_project:7,delete_record:[3,7],delimet:[9,10],delimit:9,demograph:7,desir:9,dest_path:11,dest_urlstr:11,dest_w:11,dest_wskei:11,detail:[2,6],determin:[9,10],dev:9,dict:[3,4,5,9,10],dictionari:[3,4,5,6,7],did:3,differ:5,directori:9,disk:11,dispatch:4,dist_resolut:3,distribut:3,doc:6,docstr:2,document:3,doe:[0,10],domain:[9,10],done:4,download:[0,1,2,3,4,5,6,7,8,11],download_manifest:9,download_url:11,e043ab8b77b9:7,each:[3,7,9],effici:8,either:7,elasticsearch:6,els:[0,10],elsewher:10,empti:7,end:5,endpoint:[0,1,3,4,5,6,7,11],entir:7,entri:3,env:0,environ:0,equal:6,error:[9,10],error_nam:9,etc:7,everi:[8,9],exampl:[0,1,3,4,5,6,7,9,11],exclud:3,execut:[6,7],exist:[3,5,8,10],expect:[5,8,9],experi:7,expir:0,export_nod:7,export_record:7,extent:9,f1f8:9,fail:7,fals:[3,5,9],featur:1,fenc:0,field:[3,5,6,9,10],fieldnam:9,file:[0,2,3,4,7,8,9,10,11],file_nam:[3,9],file_s:9,file_st:3,fileformat:7,filenam:[0,7,9,10],fill:10,filter:[5,6],filter_object:6,first:[6,7],flag:9,folder:9,follow:[0,9],form:11,format:[3,5,7,9],from:[0,1,2,3,4,5,6,7,8,9,10,11],func_to_parse_row:[9,10],gen3:[9,10],gen3_api_kei:0,gen3auth:[0,1,3,4,5,6,7,9,10,11],gen3fil:1,gen3index:3,gen3job:4,gen3metadata:5,gen3queri:6,gen3submiss:7,gen3wsstorag:11,gener:[0,1,2,3,4,5,6,7,11],get:[0,1,3,4,5,7,9,10,11],get_access_token:0,get_all_record:3,get_and_verify_fileinfos_from_manifest:9,get_and_verify_fileinfos_from_tsv_manifest:9,get_dictionary_al:7,get_dictionary_nod:7,get_graphql_schema:7,get_guid_from_fil:10,get_index_key_path:5,get_latest_vers:3,get_output:4,get_presigned_url:1,get_program:7,get_project:7,get_project_dictionari:7,get_project_manifest:7,get_record:3,get_record_doc:3,get_records_on_pag:3,get_stat:3,get_statu:4,get_url:3,get_vers:[3,4,5],get_with_param:3,giangb:9,github:[2,6],give:1,given:[0,3,4,5,7,10,11],global:4,good:3,graph:7,graphql:[6,7],graphql_queri:6,group:3,guid:[1,3,5,9,10],guid_exampl:9,guid_for_row:10,guid_from_fil:10,guid_type_for_indexed_file_object:10,guid_type_for_non_indexed_file_object:10,guppi:6,handl:3,has:9,has_vers:3,hash:[3,9],hash_typ:3,have:[5,9],header:9,healthi:[3,4,5],help:9,helper:2,hit:9,how:[7,9],http:[1,6,9,10],idea:3,identifi:[3,8],idp:0,ids:3,immut:3,implement:0,includ:[0,3],include_additional_column:9,index:[0,2,5,8],index_manifest:9,index_object_manifest:9,indexd:[3,9,10],indexd_field:[9,10],indexd_record_page_s:9,indexed_file_object_guid:10,indic:[0,9],info:[3,9],inform:[2,3],ingest:[2,8],ingest_manifest:10,initi:0,input:[4,9],instal:[0,2],instanc:[1,3,6,7,8],instead:6,integ:[3,7],interact:[1,3,4,5,7,11],interpret:0,introspect:7,involv:8,is_healthi:[3,4,5],is_indexed_file_object:10,its:3,job:2,job_id:4,job_input:4,job_nam:4,json:[0,1,3,4,5,6,7,9,11],just:[5,9,10],jwt:0,kei:[0,3,5,11],kwarg:[4,5],larg:8,latest:3,least:3,librari:9,like:[3,5,8,9,10],limit:[1,3,5,10],linear:4,list:[3,4,5,6,7,9,11],list_job:4,live:[9,10],local:[0,11],lock:10,log:[7,9,10],logic:[5,10],loop:9,ls_path:11,made:3,mai:8,make:[8,9],manag:[1,5],mani:[7,9],manifest:[7,8,9,10],manifest_fil:[9,10],manifest_file_delimit:[9,10],manifest_row_pars:[9,10],map:[0,9],mark:7,master:6,match:[3,5,10],max:5,max_concurrent_request:[9,10],max_tri:7,maximum:[9,10],md5:[3,9],md5_hash:9,mds:[5,10],mean:7,mechan:3,metadata:[2,3,8],metadata_list:5,metadata_sourc:10,metadata_typ:10,metdata:10,method:6,minut:0,mode:6,modul:[2,9],more:[2,5,6,8],most:8,mostli:2,multipl:[7,9],must:5,my_field:6,my_index:6,my_program:6,my_project:6,name:[3,4,7,9,10,11],namespac:10,necessari:[3,5],need:[3,6,9],nest:5,net:9,node:7,node_nam:7,node_typ:7,none:[0,1,3,4,5,6,7,9,10,11],note:[3,9,10],noth:3,now:[1,7],num:5,num_process:9,num_total_fil:9,number:[3,6,7,9,10],object:[1,3,4,5,6,7,8,9,11],off:5,offset:[5,6],old:3,one:[3,5,9],onli:[3,5,6,7],open:[7,9],open_project:7,opt:0,option:[0,1,3,4,5,6,7,9],order:[0,7],ordered_node_list:7,output:[4,5,9,10],output_filenam:[9,10],overrid:[9,10],overwrit:5,page:[0,1,2,3,4,5,6,7,9,11],pagin:3,parallel:9,param:[3,7],paramet:[0,1,3,4,5,6,7,9,10,11],pars:[9,10,11],parser:[9,10],pass:[0,6,7],password:[9,10],path:[0,5,9,11],path_to_manifest:9,pattern:[3,10],pdcdatastor:9,per:[9,10],peregrin:7,persist:8,phs0001:9,phs0002:9,pick:1,pla:9,place:9,planx:9,point:[0,1,3,4,5,6,7,11],popul:10,posit:6,post:[0,9],presign:1,prev_guid:9,previou:[3,9],previous:4,print:7,process:9,processed_fil:9,profil:[0,1,3,4,5,6,7,11],program:[7,9],progress:7,project:[7,9],project_id:[6,7],protocol:1,provid:[0,3,5,6,7,10],put:0,python:[2,8,9],queri:[1,2,3,5,7,10],query_str:6,query_txt:[6,7],query_url:3,quickstart:2,rather:0,raw:[6,9],raw_data_download:6,rbac:3,read:[3,5],readm:2,record:[3,5,6,7,9,10],refresh:0,refresh_access_token:0,refresh_fil:[0,1,3,4,5,6,7,11],refresh_token:0,regist:7,regular:6,relat:8,remov:11,replac:9,replace_url:9,repo:2,repres:[3,5],represent:3,request:[0,3,7,9,10],respect:6,respons:[0,3,4],result:7,retri:7,retriev:[1,7,10],return_full_metadata:5,rev:3,revers:7,revis:3,right:1,rm_path:11,root:[9,10],row:[6,7,9,10],row_offset:7,run:7,safe:9,same:[5,9,11],sampl:7,sandbox:[0,1,3,4,5,6,7,11],schema:7,screen:7,script:2,search:[0,2,3],second:4,see:[6,9],semaphon:10,semaphor:10,separ:9,servic:[1,3,4,5,7,10,11],service_loc:[3,4,5],session:9,set:0,setup:2,sheepdog:7,should:[7,9],shown:9,signpost:3,simpl:3,simpli:9,sinc:3,singl:7,size:[3,9],skip:7,sleep:4,some:[0,2],sort:6,sort_field:6,sort_object:6,sourc:[0,1,2,3,4,5,6,7,9,10,11],space:9,specif:[5,7,9,10],specifi:[0,3,11],spreadsheet:7,src_path:11,src_urlstr:11,src_w:11,src_wskei:11,ssl:[3,4,5],start:[3,4,6,7],statu:4,storag:2,store:3,str:[0,1,3,4,5,6,7,9,10],string:[0,3,5,9,11],strip:9,sub:7,subject:[6,7],submiss:2,submit:7,submit_fil:7,submit_record:7,submitter_id:6,suffici:3,suppli:3,support:[0,1,5,7,9],synchron:9,syntax:6,system:[6,7,8],tab:9,task:8,temporari:9,test1:9,test2:9,test3:9,test4:9,test5:9,test:9,text:[6,7],than:[0,5],thei:0,them:9,thi:[0,1,2,3,4,5,6,7,9,10,11],those:9,thread:9,thread_num:9,threadcontrol:9,through:[7,9],tier:6,time:[1,7,9],tmp_folder:9,token:0,tool:2,total:9,treat:5,tsv:[7,9,10],tupl:[3,9,10],type:[3,4,5,6,7,9,10],unaccess:6,under:[0,7,11],until:4,updat:[3,5,9],update_blank:3,update_record:3,upload:[3,7,11],upload_url:11,url:[1,3,8,9,10,11],urls_metadata:3,usag:9,use:[0,1,3,4,5,6,9,10],used:[5,10],user:[0,10],using:[0,1,3,4,5,6,7,9,11],usual:10,util:8,uuid1:7,uuid2:7,uuid:[3,7],valid:6,valu:[0,3,5,6,9],value_from_indexd:9,value_from_manifest:9,variabl:[0,6,7],variou:2,verbos:[6,7],verif:9,verifi:[2,8],verify_manifest:9,verify_object_manifest:9,version:[3,4,5],vital_statu:6,wait:4,want:[3,7],web:0,what:5,when:[0,3,6,10],where:[3,5,9,10],whether:[3,4,5,7,10],which:7,whose:5,within:[0,2,8],without:[3,5],won:5,work:0,workaround:9,worksheet:7,workspac:[0,2],wrapper:9,write:9,ws_urlstr:11,wskei:11,wss:11,wts:0,xlsx:7,you:[3,7,9]},titles:["Gen3 Auth Helper","Gen3 File Class","Welcome to Gen3 SDK\u2019s documentation!","Gen3 Index Class","Gen3 Jobs Class","Gen3 Metadata Class","Gen3 Query Class","Gen3 Submission Class","Gen3 Tools","Indexing Tools","Metadata Tools","Gen3 Workspace Storage"],titleterms:{"class":[1,3,4,5,6,7],auth:0,document:2,download:9,file:1,gen3:[0,1,2,3,4,5,6,7,8,11],helper:0,index:[3,9],indic:2,ingest:10,job:4,metadata:[5,10],queri:6,sdk:2,storag:11,submiss:7,tabl:2,tool:[8,9,10],verifi:9,welcom:2,workspac:11}}) \ No newline at end of file +Search.setIndex({docnames:["auth","file","index","indexing","jobs","metadata","query","submission","tools","tools/indexing","tools/metadata","wss"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":3,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":2,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.viewcode":1,sphinx:56},filenames:["auth.rst","file.rst","index.rst","indexing.rst","jobs.rst","metadata.rst","query.rst","submission.rst","tools.rst","tools/indexing.rst","tools/metadata.rst","wss.rst"],objects:{"gen3.auth":{Gen3Auth:[0,0,1,""]},"gen3.auth.Gen3Auth":{curl:[0,1,1,""],get_access_token:[0,1,1,""],refresh_access_token:[0,1,1,""]},"gen3.file":{Gen3File:[1,0,1,""]},"gen3.file.Gen3File":{get_presigned_url:[1,1,1,""]},"gen3.index":{Gen3Index:[3,0,1,""]},"gen3.index.Gen3Index":{async_create_record:[3,1,1,""],async_get_record:[3,1,1,""],async_get_records_on_page:[3,1,1,""],async_get_with_params:[3,1,1,""],async_query_urls:[3,1,1,""],async_update_record:[3,1,1,""],create_blank:[3,1,1,""],create_new_version:[3,1,1,""],create_record:[3,1,1,""],delete_record:[3,1,1,""],get:[3,1,1,""],get_all_records:[3,1,1,""],get_latest_version:[3,1,1,""],get_record:[3,1,1,""],get_record_doc:[3,1,1,""],get_records:[3,1,1,""],get_records_on_page:[3,1,1,""],get_stats:[3,1,1,""],get_urls:[3,1,1,""],get_version:[3,1,1,""],get_versions:[3,1,1,""],get_with_params:[3,1,1,""],is_healthy:[3,1,1,""],query_urls:[3,1,1,""],update_blank:[3,1,1,""],update_record:[3,1,1,""]},"gen3.jobs":{Gen3Jobs:[4,0,1,""]},"gen3.jobs.Gen3Jobs":{async_run_job_and_wait:[4,1,1,""],create_job:[4,1,1,""],get_output:[4,1,1,""],get_status:[4,1,1,""],get_version:[4,1,1,""],is_healthy:[4,1,1,""],list_jobs:[4,1,1,""]},"gen3.metadata":{Gen3Metadata:[5,0,1,""]},"gen3.metadata.Gen3Metadata":{"delete":[5,1,1,""],async_create:[5,1,1,""],async_get:[5,1,1,""],async_update:[5,1,1,""],auth_provider:[5,2,1,""],batch_create:[5,1,1,""],create:[5,1,1,""],create_index_key_path:[5,1,1,""],delete_index_key_path:[5,1,1,""],endpoint:[5,2,1,""],get:[5,1,1,""],get_index_key_paths:[5,1,1,""],get_version:[5,1,1,""],is_healthy:[5,1,1,""],query:[5,1,1,""],update:[5,1,1,""]},"gen3.query":{Gen3Query:[6,0,1,""]},"gen3.query.Gen3Query":{graphql_query:[6,1,1,""],query:[6,1,1,""],raw_data_download:[6,1,1,""]},"gen3.submission":{Gen3Submission:[7,0,1,""]},"gen3.submission.Gen3Submission":{create_program:[7,1,1,""],create_project:[7,1,1,""],delete_node:[7,1,1,""],delete_nodes:[7,1,1,""],delete_program:[7,1,1,""],delete_project:[7,1,1,""],delete_record:[7,1,1,""],delete_records:[7,1,1,""],export_node:[7,1,1,""],export_record:[7,1,1,""],get_dictionary_all:[7,1,1,""],get_dictionary_node:[7,1,1,""],get_graphql_schema:[7,1,1,""],get_programs:[7,1,1,""],get_project_dictionary:[7,1,1,""],get_project_manifest:[7,1,1,""],get_projects:[7,1,1,""],open_project:[7,1,1,""],query:[7,1,1,""],submit_file:[7,1,1,""],submit_record:[7,1,1,""]},"gen3.tools.indexing":{download_manifest:[9,3,0,"-"],index_manifest:[9,3,0,"-"],verify_manifest:[9,3,0,"-"]},"gen3.tools.indexing.download_manifest":{CURRENT_DIR:[9,2,1,""],INDEXD_RECORD_PAGE_SIZE:[9,2,1,""],MAX_CONCURRENT_REQUESTS:[9,2,1,""],TMP_FOLDER:[9,2,1,""],async_download_object_manifest:[9,4,1,""]},"gen3.tools.indexing.index_manifest":{ACLS:[9,2,1,""],AUTHZ:[9,2,1,""],CURRENT_DIR:[9,2,1,""],GUID:[9,2,1,""],MD5:[9,2,1,""],PREV_GUID:[9,2,1,""],SIZE:[9,2,1,""],ThreadControl:[9,0,1,""],URLS:[9,2,1,""],get_and_verify_fileinfos_from_manifest:[9,4,1,""],get_and_verify_fileinfos_from_tsv_manifest:[9,4,1,""],index_object_manifest:[9,4,1,""]},"gen3.tools.indexing.verify_manifest":{CURRENT_DIR:[9,2,1,""],MAX_CONCURRENT_REQUESTS:[9,2,1,""],async_verify_object_manifest:[9,4,1,""]},"gen3.tools.metadata":{ingest_manifest:[10,3,0,"-"]},"gen3.tools.metadata.ingest_manifest":{COLUMN_TO_USE_AS_GUID:[10,2,1,""],GUID_TYPE_FOR_INDEXED_FILE_OBJECT:[10,2,1,""],GUID_TYPE_FOR_NON_INDEXED_FILE_OBJECT:[10,2,1,""],MAX_CONCURRENT_REQUESTS:[10,2,1,""],async_ingest_metadata_manifest:[10,4,1,""],async_query_urls_from_indexd:[10,4,1,""]},"gen3.wss":{Gen3WsStorage:[11,0,1,""]},"gen3.wss.Gen3WsStorage":{copy:[11,1,1,""],download:[11,1,1,""],download_url:[11,1,1,""],ls:[11,1,1,""],ls_path:[11,1,1,""],rm:[11,1,1,""],rm_path:[11,1,1,""],upload:[11,1,1,""],upload_url:[11,1,1,""]},gen3:{tools:[8,3,0,"-"]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","attribute","Python attribute"],"3":["py","module","Python module"],"4":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:attribute","3":"py:module","4":"py:function"},terms:{"0420947":[],"0938203":[],"0939903":[],"0a80fada010c":9,"0a80fada096c":9,"0a80fada097c":9,"0a80fada098c":9,"0a80fada099c":9,"100":[7,9],"11e9":9,"1616009968":[],"1616010780":[],"1616010781":[],"1616018689":[],"1617985382":[],"1617997157":[],"1618604466":[],"1618606085":[],"1619452575":[],"1619452576":[],"1619720217":9,"1619720218":10,"2280114":[],"255e396f":9,"2889535":[],"333":5,"343434344":9,"363455714":9,"3910377":[],"3926728":[],"4036705":10,"450c":9,"4714":7,"473d83400bc1bc9dc635e334fadd433c":9,"473d83400bc1bc9dc635e334faddd33c":9,"473d83400bc1bc9dc635e334fadde33c":9,"473d83400bc1bc9dc635e334faddf33c":9,"4904246":[],"543434443":9,"5495481":[],"5964222":[],"641011":[],"6572418":[],"6f90":7,"7461076":[],"7649791":[],"7d3d8d2083b4":9,"8420":7,"9159722":[],"934012":9,"93d9af72":9,"9644923":[],"9a07":9,"boolean":3,"class":[0,2,9,11],"default":[0,6,7,9,10],"export":7,"function":[2,3,4,5,8,9,10],"import":9,"int":[3,5,6,7,9,10],"new":[0,3],"public":[3,5],"return":[0,3,4,5,6,7,9],"true":[3,4,5,6,7,9,10],"while":[0,1,3,4,5,6,7,11],But:5,For:[1,5,6,7,8,9],NOT:10,One:6,Such:8,That:3,The:[0,1,2,3,5,7,9],There:9,These:8,Used:9,Will:[4,6],_get_acl_from_row:9,_get_authz_from_row:9,_get_file_name_from_row:9,_get_file_size_from_row:9,_get_guid_for_row:10,_get_guid_from_row:9,_get_md5_from_row:9,_get_urls_from_row:9,_guid_typ:10,_query_for_associated_indexd_record_guid:10,_ssl:[3,4,5],a5c6:9,ab167e49d25b488939b1ede42752458b:3,about:[2,3],abov:9,access:[0,1,3,6],accesstoken:0,acl:[3,9],across:9,action:[8,9],actual:9,add:[3,5],added:3,addit:3,admin:[5,9],admin_endpoint_suffix:5,against:[3,6,7,9,10],algorithm:3,alia:3,aliv:6,all:[3,4,5,6,7,9,10],allow:[7,9,10],along:2,alreadi:8,altern:9,alwai:5,ammount:10,amount:8,ani:[5,9],anoth:5,api:[5,7,9],api_kei:9,appli:6,appropri:11,arbitrari:0,argument:[0,11],arrai:7,asc:6,assign:8,associ:[3,5],assum:9,async:[4,8,9,10],async_cr:5,async_create_record:3,async_download_object_manifest:9,async_get:5,async_get_record:3,async_get_records_on_pag:3,async_get_with_param:3,async_ingest_metadata_manifest:10,async_query_url:3,async_query_urls_from_indexd:10,async_run_job_and_wait:4,async_upd:5,async_update_record:3,async_verify_object_manifest:9,asynchron:[3,4,5],asyncio:[9,10],attach:[3,5],attribut:9,auth:[1,2,3,4,5,6,7,9,10,11],auth_provid:[1,3,4,5,6,7,11],authbas:0,authent:0,authz:[0,3,9],auto:[0,2],automat:0,avail:[1,2],b0f1:9,base:[0,1,3,4,5,6,7,8,9,11],baseid:3,basic:[3,9,10],batch_creat:5,batch_siz:7,behavior:9,belong:7,below:9,blank:3,blob:[5,6],bodi:3,bool:[4,5,7,9,10],broad:8,broken:8,call:11,can:[0,3,4,7,9,10],capabl:8,categori:8,ccle:7,ccle_one_record:7,ccle_sample_nod:7,cdi:6,chang:[3,9],chunk_siz:7,client:3,code:[2,7],column:[9,10],column_to_use_as_guid:10,com:6,comma:9,common:[0,1,3,4,5,6,7,8,9,10,11],commons_url:[9,10],complet:4,complex:6,concat:9,concurr:[9,10],connect:10,consist:3,constructor:0,contain:[0,2,5,7,8,9,10],content:[3,11],control:3,copi:11,coroutin:9,correspond:3,crdc:0,creat:[3,4,5,7,9],create_blank:3,create_index_key_path:5,create_job:4,create_new_vers:3,create_program:7,create_project:7,create_record:3,cred:3,credenti:[0,1,3,4,5,6,7,9,11],csv:[7,9,10],curl:0,current:7,current_dir:9,custom:9,d70b41b9:7,data:[0,3,5,6,7],data_spreadsheet:7,data_typ:6,databas:5,dbgap:10,dcf:7,def:9,defin:[5,7],delai:4,delet:[0,3,5,7],delete_index_key_path:5,delete_nod:7,delete_program:7,delete_project:7,delete_record:[3,7],delimet:[9,10],delimit:9,demograph:7,desir:9,dest_path:11,dest_urlstr:11,dest_w:11,dest_wskei:11,detail:[2,6],determin:[9,10],dev:9,dict:[3,4,5,9,10],dictionari:[3,4,5,6,7],did:3,differ:5,directori:9,disk:11,dispatch:4,dist_resolut:3,distribut:3,doc:6,docstr:2,document:3,doe:[0,10],domain:[9,10],done:4,download:[0,1,2,3,4,5,6,7,8,11],download_manifest:9,download_url:11,e043ab8b77b9:7,each:[3,7,9],effici:8,either:7,elasticsearch:6,els:[0,10],elsewher:10,empti:7,end:5,endpoint:[0,1,3,4,5,6,7,11],entir:7,entri:3,env:0,environ:0,equal:6,error:[9,10],error_nam:9,etc:7,everi:[8,9],exampl:[0,1,3,4,5,6,7,9,11],exclud:3,execut:[6,7],exist:[3,5,8,10],expect:[5,8,9],experi:7,expir:0,export_nod:7,export_record:7,extent:9,f1f8:9,fail:7,fals:[3,5,9],featur:1,fenc:0,field:[3,5,6,9,10],fieldnam:9,file:[0,2,3,4,7,8,9,10,11],file_nam:[3,9],file_s:9,file_st:3,fileformat:7,filenam:[0,7,9,10],fill:10,filter:[5,6],filter_object:6,first:[6,7],flag:9,folder:9,follow:[0,9],form:11,format:[3,5,7,9],from:[0,1,2,3,4,5,6,7,8,9,10,11],func_to_parse_row:[9,10],gen3:[9,10],gen3_api_kei:0,gen3auth:[0,1,3,4,5,6,7,9,10,11],gen3fil:1,gen3index:3,gen3job:4,gen3metadata:5,gen3queri:6,gen3submiss:7,gen3wsstorag:11,gener:[0,1,2,3,4,5,6,7,11],get:[0,1,3,4,5,7,9,10,11],get_access_token:0,get_all_record:3,get_and_verify_fileinfos_from_manifest:9,get_and_verify_fileinfos_from_tsv_manifest:9,get_dictionary_al:7,get_dictionary_nod:7,get_graphql_schema:7,get_guid_from_fil:10,get_index_key_path:5,get_latest_vers:3,get_output:4,get_presigned_url:1,get_program:7,get_project:7,get_project_dictionari:7,get_project_manifest:7,get_record:3,get_record_doc:3,get_records_on_pag:3,get_stat:3,get_statu:4,get_url:3,get_vers:[3,4,5],get_with_param:3,giangb:9,github:[2,6],give:1,given:[0,3,4,5,7,10,11],global:4,good:3,graph:7,graphql:[6,7],graphql_queri:6,group:3,guid:[1,3,5,9,10],guid_exampl:9,guid_for_row:10,guid_from_fil:10,guid_type_for_indexed_file_object:10,guid_type_for_non_indexed_file_object:10,guppi:6,handl:3,has:9,has_vers:3,hash:[3,9],hash_typ:3,have:[5,9],header:9,healthi:[3,4,5],help:9,helper:2,hit:9,how:[7,9],http:[1,6,9,10],idea:3,identifi:[3,8],idp:0,ids:3,immut:3,implement:0,includ:[0,3],include_additional_column:9,index:[0,2,5,8],index_manifest:9,index_object_manifest:9,indexd:[3,9,10],indexd_field:[9,10],indexd_record_page_s:9,indexed_file_object_guid:10,indic:[0,9],info:[3,9],inform:[2,3],ingest:[2,8],ingest_manifest:10,initi:0,input:[4,9],instal:[0,2],instanc:[1,3,6,7,8],instead:6,integ:[3,7],interact:[1,3,4,5,7,11],interpret:0,introspect:7,involv:8,is_healthi:[3,4,5],is_indexed_file_object:10,its:3,job:2,job_id:4,job_input:4,job_nam:4,json:[0,1,3,4,5,6,7,9,11],just:[5,9,10],jwt:0,kei:[0,3,5,11],kwarg:[4,5],larg:8,latest:3,least:3,librari:9,like:[3,5,8,9,10],limit:[1,3,5,10],linear:4,list:[3,4,5,6,7,9,11],list_job:4,live:[9,10],local:[0,11],lock:10,log:[7,9,10],logic:[5,10],loop:9,ls_path:11,made:3,mai:8,make:[8,9],manag:[1,5],mani:[7,9],manifest:[7,8,9,10],manifest_fil:[9,10],manifest_file_delimit:[9,10],manifest_row_pars:[9,10],map:[0,9],mark:7,master:6,match:[3,5,10],max:5,max_concurrent_request:[9,10],max_tri:7,maximum:[9,10],md5:[3,9],md5_hash:9,mds:[5,10],mean:7,mechan:3,metadata:[2,3,8],metadata_list:5,metadata_sourc:10,metadata_typ:10,metdata:10,method:6,minut:0,mode:6,modul:[2,9],more:[2,5,6,8],most:8,mostli:2,multipl:[7,9],must:5,my_field:6,my_index:6,my_program:6,my_project:6,name:[3,4,7,9,10,11],namespac:10,necessari:[3,5],need:[3,6,9],nest:5,net:9,node:7,node_nam:7,node_typ:7,none:[0,1,3,4,5,6,7,9,10,11],note:[3,9,10],noth:3,now:[1,7],num:5,num_process:9,num_total_fil:9,number:[3,6,7,9,10],object:[1,3,4,5,6,7,8,9,11],off:5,offset:[5,6],old:3,one:[3,5,9],onli:[3,5,6,7],open:[7,9],open_project:7,opt:0,option:[0,1,3,4,5,6,7,9],order:[0,7],ordered_node_list:7,output:[4,5,9,10],output_filenam:[9,10],overrid:[9,10],overwrit:5,page:[0,1,2,3,4,5,6,7,9,11],pagin:3,parallel:9,param:[3,7],paramet:[0,1,3,4,5,6,7,9,10,11],pars:[9,10,11],parser:[9,10],pass:[0,6,7],password:[9,10],path:[0,5,9,11],path_to_manifest:9,pattern:[3,10],pdcdatastor:9,per:[9,10],peregrin:7,persist:8,phs0001:9,phs0002:9,pick:1,pla:9,place:9,planx:9,point:[0,1,3,4,5,6,7,11],popul:10,posit:6,post:[0,9],presign:1,prev_guid:9,previou:[3,9],previous:4,print:7,process:9,processed_fil:9,profil:[0,1,3,4,5,6,7,11],program:[7,9],progress:7,project:[7,9],project_id:[6,7],protocol:1,provid:[0,3,5,6,7,10],put:0,python:[2,8,9],queri:[1,2,3,5,7,10],query_str:6,query_txt:[6,7],query_url:3,quickstart:2,rather:0,raw:[6,9],raw_data_download:6,rbac:3,read:[3,5],readm:2,record:[3,5,6,7,9,10],refresh:0,refresh_access_token:0,refresh_fil:[0,1,3,4,5,6,7,11],refresh_token:0,regist:7,regular:6,relat:8,remov:11,replac:9,replace_url:9,repo:2,repres:[3,5],represent:3,request:[0,3,7,9,10],respect:6,respons:[0,3,4],result:7,retri:7,retriev:[1,7,10],return_full_metadata:5,rev:3,revers:7,revis:3,right:1,rm_path:11,root:[9,10],row:[6,7,9,10],row_offset:7,run:7,safe:9,same:[5,9,11],sampl:7,sandbox:[0,1,3,4,5,6,7,11],schema:7,screen:7,script:2,search:[0,2,3],second:4,see:[6,9],semaphon:10,semaphor:10,separ:9,servic:[1,3,4,5,7,10,11],service_loc:[3,4,5],session:9,set:0,setup:2,sheepdog:7,should:[7,9],shown:9,signpost:3,simpl:3,simpli:9,sinc:3,singl:7,size:[3,9],skip:7,sleep:4,some:[0,2],sort:6,sort_field:6,sort_object:6,sourc:[0,1,2,3,4,5,6,7,9,10,11],space:9,specif:[5,7,9,10],specifi:[0,3,11],spreadsheet:7,src_path:11,src_urlstr:11,src_w:11,src_wskei:11,ssl:[3,4,5],start:[3,4,6,7],statu:4,storag:2,store:3,str:[0,1,3,4,5,6,7,9,10],string:[0,3,5,9,11],strip:9,sub:7,subject:[6,7],submiss:2,submit:7,submit_fil:7,submit_record:7,submitter_id:6,suffici:3,suppli:3,support:[0,1,5,7,9],synchron:9,syntax:6,system:[6,7,8],tab:9,task:8,temporari:9,test1:9,test2:9,test3:9,test4:9,test5:9,test:9,text:[6,7],than:[0,5],thei:0,them:9,thi:[0,1,2,3,4,5,6,7,9,10,11],those:9,thread:9,thread_num:9,threadcontrol:9,through:[7,9],tier:6,time:[1,7,9],tmp_folder:9,token:0,tool:2,total:9,treat:5,tsv:[7,9,10],tupl:[3,9,10],type:[3,4,5,6,7,9,10],unaccess:6,under:[0,7,11],until:4,updat:[3,5,9],update_blank:3,update_record:3,upload:[3,7,11],upload_url:11,url:[1,3,8,9,10,11],urls_metadata:3,usag:9,use:[0,1,3,4,5,6,9,10],used:[5,10],user:[0,10],using:[0,1,3,4,5,6,7,9,11],usual:10,util:8,uuid1:7,uuid2:7,uuid:[3,7],valid:6,valu:[0,3,5,6,9],value_from_indexd:9,value_from_manifest:9,variabl:[0,6,7],variou:2,verbos:[6,7],verif:9,verifi:[2,8],verify_manifest:9,verify_object_manifest:9,version:[3,4,5],vital_statu:6,wait:4,want:[3,7],web:0,what:5,when:[0,3,6,10],where:[3,5,9,10],whether:[3,4,5,7,10],which:7,whose:5,within:[0,2,8],without:[3,5],won:5,work:0,workaround:9,worksheet:7,workspac:[0,2],wrapper:9,write:9,ws_urlstr:11,wskei:11,wss:11,wts:0,xlsx:7,you:[3,7,9]},titles:["Gen3 Auth Helper","Gen3 File Class","Welcome to Gen3 SDK\u2019s documentation!","Gen3 Index Class","Gen3 Jobs Class","Gen3 Metadata Class","Gen3 Query Class","Gen3 Submission Class","Gen3 Tools","Indexing Tools","Metadata Tools","Gen3 Workspace Storage"],titleterms:{"class":[1,3,4,5,6,7],auth:0,document:2,download:9,file:1,gen3:[0,1,2,3,4,5,6,7,8,11],helper:0,index:[3,9],indic:2,ingest:10,job:4,metadata:[5,10],queri:6,sdk:2,storag:11,submiss:7,tabl:2,tool:[8,9,10],verifi:9,welcom:2,workspac:11}}) \ No newline at end of file diff --git a/docs/_build/html/tools/indexing.html b/docs/_build/html/tools/indexing.html index 02e8dd341..a20623cab 100644 --- a/docs/_build/html/tools/indexing.html +++ b/docs/_build/html/tools/indexing.html @@ -364,7 +364,7 @@

Indexing Tools
-async gen3.tools.indexing.verify_manifest.async_verify_object_manifest(commons_url, manifest_file, max_concurrent_requests=24, manifest_row_parsers={'acl': <function _get_acl_from_row>, 'authz': <function _get_authz_from_row>, 'file_name': <function _get_file_name_from_row>, 'file_size': <function _get_file_size_from_row>, 'guid': <function _get_guid_from_row>, 'md5': <function _get_md5_from_row>, 'urls': <function _get_urls_from_row>}, manifest_file_delimiter=None, output_filename='verify-manifest-errors-1619452575.9644923.log')[source]
+async gen3.tools.indexing.verify_manifest.async_verify_object_manifest(commons_url, manifest_file, max_concurrent_requests=24, manifest_row_parsers={'acl': <function _get_acl_from_row>, 'authz': <function _get_authz_from_row>, 'file_name': <function _get_file_name_from_row>, 'file_size': <function _get_file_size_from_row>, 'guid': <function _get_guid_from_row>, 'md5': <function _get_md5_from_row>, 'urls': <function _get_urls_from_row>}, manifest_file_delimiter=None, output_filename='verify-manifest-errors-1619720217.934012.log')[source]

Verify all file object records into a manifest csv

Parameters
diff --git a/docs/_build/html/tools/metadata.html b/docs/_build/html/tools/metadata.html index 24256ca00..d7b170291 100644 --- a/docs/_build/html/tools/metadata.html +++ b/docs/_build/html/tools/metadata.html @@ -102,7 +102,7 @@

Metadata Tools
-async gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest(commons_url, manifest_file, metadata_source, auth=None, max_concurrent_requests=24, manifest_row_parsers={'guid_for_row': <function _get_guid_for_row>, 'indexed_file_object_guid': <function _query_for_associated_indexd_record_guid>}, manifest_file_delimiter=None, output_filename='ingest-metadata-manifest-errors-1619452576.3926728.log', get_guid_from_file=True, metadata_type=None)[source]
+async gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest(commons_url, manifest_file, metadata_source, auth=None, max_concurrent_requests=24, manifest_row_parsers={'guid_for_row': <function _get_guid_for_row>, 'indexed_file_object_guid': <function _query_for_associated_indexd_record_guid>}, manifest_file_delimiter=None, output_filename='ingest-metadata-manifest-errors-1619720218.4036705.log', get_guid_from_file=True, metadata_type=None)[source]

Ingest all metadata records into a manifest csv

Parameters
diff --git a/gen3/tools/indexing/manifest_columns.py b/gen3/tools/indexing/manifest_columns.py index b167a05c0..bc4b3ca8f 100644 --- a/gen3/tools/indexing/manifest_columns.py +++ b/gen3/tools/indexing/manifest_columns.py @@ -180,7 +180,7 @@ def _parse_multiple_values(values): ['/a', '/b'] ['/a', '/b'] """ - values = values.translate(values.maketrans("[],\"'", " ")) + values = values.translate(values.maketrans("[]\"'", " ")) return values.split() diff --git a/gen3/tools/indexing/merge_manifests.py b/gen3/tools/indexing/merge_manifests.py index 8102f633f..3891beea7 100644 --- a/gen3/tools/indexing/merge_manifests.py +++ b/gen3/tools/indexing/merge_manifests.py @@ -1,3 +1,28 @@ +""" +Merging indexing manifests with arbitrary columns + +Example: + + guid md5 size urls authz more_data + dg/123 f7cb... 42 http://cats.com /foo moredata ++ + guid md5 size urls authz extra_data + dg/123 f7cb... 42 s3://bucket/cats /baz stuff += + acl authz guid md5 size urls extra_data more_data + /baz /foo dg/123 f7cb... 42 http://cats.com s3://bucket/cats stuff moredata + +Is able to handle situations where multiple different guids for the same hash is +allowed. For example, if the following is valid: + +guid md5 size +dg/124 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 +dg/123 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 + +By default, this will NOT allow multiple GUIDs per hash and will try to merge all +into one. + +""" import os import logging import csv @@ -22,6 +47,7 @@ def merge_bucket_manifests( output_manifest="merged-bucket-manifest.tsv", continue_after_error=False, allow_mult_guids_per_hash=False, + columns_with_arrays=None, **kwargs, ): """ @@ -52,10 +78,17 @@ def merge_bucket_manifests( of this code is to combine such entries, however, in cases where you have existing GUIDs with the same md5 but still want to merge manifests together, this can be used. + columns_with_arrays(list[str]): list of column names where their values should + be treated like arrays (so that when merging we know to combine) Returns: None """ + columns_with_arrays = columns_with_arrays or [] + columns_with_arrays.extend( + [URLS_STANDARD_KEY, ACL_STANDARD_KEY, AUTHZ_STANDARD_KEY] + ) + files = files or [] if not files: logging.info(f"Iterating over manifests in {directory} directory") @@ -70,137 +103,288 @@ def merge_bucket_manifests( records_from_file, _ = get_and_verify_fileinfos_from_manifest( manifest, include_additional_columns=True ) + records_with_no_guid = [] for record in records_from_file: - record_to_write = copy.deepcopy(record) - if record[MD5_STANDARD_KEY] in all_rows: - previous_guid_exists = False - # if the record already exists, let's start with existing data and - # update as needed - record_to_write = copy.deepcopy(all_rows[record[MD5_STANDARD_KEY]][-1]) - - if GUID_STANDARD_KEY in record: - guid = record[GUID_STANDARD_KEY] - if ( - guid - and record_to_write.get(GUID_STANDARD_KEY) - and guid != record_to_write.get(GUID_STANDARD_KEY) - ): - error_msg = ( - "Found two objects with the same hash but different guids," - f" could not merge. Details: object {record} could not be" - f" merged with object {record_to_write} because {guid} !=" - f" {record_to_write.get(GUID_STANDARD_KEY)}." - ) - logging.error(error_msg) + # simple case where this is the first time we've seen this hash + headers.update(record.keys()) + if record[MD5_STANDARD_KEY] not in all_rows: + record_to_write = copy.deepcopy(record) + all_rows[record_to_write[MD5_STANDARD_KEY]] = [record_to_write] + else: + # if the hash already exists, we need to try and update existing + # entries with any new data (and ensure we don't add duplicates) + new_guid = record.get(GUID_STANDARD_KEY) - if not continue_after_error and not allow_mult_guids_per_hash: - raise csv.Error(error_msg) + if not new_guid: + # since there's no guid specified to differentiate this from other + # entries, we will add metadata to all records later + records_with_no_guid.append(record) + continue - previous_guid_exists = True + updated_records = _get_updated_records( + record=record, + existing_records=all_rows[record[MD5_STANDARD_KEY]], + continue_after_error=continue_after_error, + allow_mult_guids_per_hash=allow_mult_guids_per_hash, + columns_with_arrays=columns_with_arrays, + ) + all_rows[record[MD5_STANDARD_KEY]] = updated_records.values() - if guid: - record_to_write[GUID_STANDARD_KEY] = guid + # for the entries where there was no GUID specified, we will add that metadata + # to all previous records + for record in records_with_no_guid: + updated_records = _get_updated_records( + record=record, + existing_records=all_rows.get(record[MD5_STANDARD_KEY], []), + continue_after_error=continue_after_error, + allow_mult_guids_per_hash=allow_mult_guids_per_hash, + columns_with_arrays=columns_with_arrays, + ) + # it's possible a record without a GUID got added if it was the FIRST + # instance of that md5, so we just need to make sure that it's removed + # if there was another GUID provided later on + # + # this also handles the edge case where there were multiple rows for the md5 + # and NO guid was provided (e.g. we want a single combined row of updated values) + any_guid_provided = [ + record.get(GUID_STANDARD_KEY) + for record in updated_records.values() + if record.get(GUID_STANDARD_KEY) + ] + if not any_guid_provided: + all_rows[record[MD5_STANDARD_KEY]] = updated_records.values() + else: + all_rows[record[MD5_STANDARD_KEY]] = [ + record + for record in updated_records.values() + if record.get(GUID_STANDARD_KEY) + ] - if SIZE_STANDARD_KEY in record: - size = record[SIZE_STANDARD_KEY] + _create_output_file( + output_manifest, headers, all_rows, output_manifest_file_delimiter + ) - if size != record_to_write[SIZE_STANDARD_KEY]: - error_msg = ( - "Found two objects with the same hash but different sizes," - f" could not merge. Details: object {record} could not be" - f" merged with object {record_to_write} because {size} !=" - f" {record_to_write[SIZE_STANDARD_KEY]}." - ) - logging.error(error_msg) - if not continue_after_error: - raise csv.Error(error_msg) +def _get_updated_records( + record, + existing_records, + continue_after_error, + allow_mult_guids_per_hash, + columns_with_arrays, +): + """ + Return a dictionary of updated records with GUIDs as keys and full dictionaries + as values (with key/values for columns+value). - # if there's a prev guid and we're allowing duplicates, we don't want - # to copy the existing url/authz/acl, so clear them out - if previous_guid_exists and allow_mult_guids_per_hash: - record_to_write = copy.deepcopy(record) + This handles the complexity of combining this new record with any existing record + where relevant, or fully adding a new entry. - if AUTHZ_STANDARD_KEY not in record_to_write: - record_to_write[AUTHZ_STANDARD_KEY] = "" - if AUTHZ_STANDARD_KEY in record: - authz = record[AUTHZ_STANDARD_KEY] - record_to_write[AUTHZ_STANDARD_KEY] = " ".join( - list( - set( - record_to_write[AUTHZ_STANDARD_KEY].split(" ") - + authz.split(" ") - ) - ) - ).strip(" ") + Args: + record (dict): dictionary of new record keys/values + existing_records (list): existing records with this same hash + continue_after_error (bool): See calling function + allow_mult_guids_per_hash (bool): See calling function + columns_with_arrays (list[str]): See calling function - if ACL_STANDARD_KEY not in record_to_write: - record_to_write[ACL_STANDARD_KEY] = "" - if ACL_STANDARD_KEY in record: - acl = record[ACL_STANDARD_KEY] - record_to_write[ACL_STANDARD_KEY] = " ".join( - list( - set( - record_to_write[ACL_STANDARD_KEY].split(" ") - + acl.split(" ") - ) - ) - ).strip(" ") - - # default value if not available - if URLS_STANDARD_KEY not in record_to_write: - record_to_write[URLS_STANDARD_KEY] = "" - # if value provided, add it to existing values - if URLS_STANDARD_KEY in record: - urls = record[URLS_STANDARD_KEY] - record_to_write[URLS_STANDARD_KEY] = " ".join( + No Longer Returned: + dict: GUIDs with full metadtata values (dicts) + + { + "dg/123": { + "acl": + "authz": "/baz /foo" + "guid": "dg/123" + "md5": "f7cb..." + "size": "42" + "urls": "http://cats.com s3://bucket/cats" + "extra_data": "stuff" + "more_data": "moredata" + }, + "dg/456": { ... } + } + + """ + updated_records = {} + new_guid = record.get(GUID_STANDARD_KEY) + new_urls = record.get(URLS_STANDARD_KEY) + + # if there's no GUID, we can assume this is metadata about + # existing records, so update *all* of them with this information + if not new_guid: + for existing_record in existing_records: + guid = existing_record.get(GUID_STANDARD_KEY) + + _error_if_invalid_size_or_guid( + record, existing_record, continue_after_error, allow_mult_guids_per_hash + ) + + logging.debug( + f"merging any new data from {record} with existing record: {existing_record}" + ) + + record_to_write = _get_updated_record( + record, + existing_record, + continue_after_error=continue_after_error, + columns_with_arrays=columns_with_arrays, + ) + + updated_records.setdefault( + record_to_write.get(GUID_STANDARD_KEY), {} + ).update(record_to_write) + + else: + # merge normally, combining + for existing_record in existing_records: + guid = existing_record.get(GUID_STANDARD_KEY) + + _error_if_invalid_size_or_guid( + record, existing_record, continue_after_error, allow_mult_guids_per_hash + ) + + if guid == new_guid: + logging.debug( + f"merging any new data from {record} with existing record: {existing_record}" + ) + + record_to_write = _get_updated_record( + record, + existing_record, + continue_after_error=continue_after_error, + columns_with_arrays=columns_with_arrays, + ) + + updated_records.setdefault( + record_to_write.get(GUID_STANDARD_KEY), {} + ).update(record_to_write) + else: + record_to_write = copy.deepcopy(record) + + updated_records.setdefault( + record_to_write.get(GUID_STANDARD_KEY), {} + ).update(record_to_write) + updated_records.setdefault( + existing_record.get(GUID_STANDARD_KEY), {} + ).update(existing_record) + + return updated_records + + +def _error_if_invalid_size_or_guid( + record, existing_record, continue_after_error, allow_mult_guids_per_hash +): + """Log and raise errors based on cfg if hashes don't match or there's multiple GUIDs""" + guid = existing_record.get(GUID_STANDARD_KEY) + new_guid = record.get(GUID_STANDARD_KEY) + + if SIZE_STANDARD_KEY in existing_record: + size = existing_record[SIZE_STANDARD_KEY] + + if size != record[SIZE_STANDARD_KEY]: + error_msg = ( + "Found two objects with the same hash but different sizes," + f" could not merge. Details: object {existing_record} could not be" + f" merged with object {record} because {size} !=" + f" {record[SIZE_STANDARD_KEY]}." + ) + logging.error(error_msg) + + if not continue_after_error: + raise csv.Error(error_msg) + + # at this point, the record has the same hash and size as a previous guid + # so either we're allowing an entry like that, or not + if GUID_STANDARD_KEY in existing_record: + if guid and new_guid and guid != new_guid: + warning_msg = ( + "Found two objects with the same hash but different guids," + f" could not merge. Details: object {existing_record} could not be" + f" merged with object {record} because {guid} !=" + f" {new_guid}." + ) + + if not allow_mult_guids_per_hash: + logging.error(warning_msg) + raise csv.Error(error_msg) + + info_msg = ( + f"Allowing multiple GUIDs per hash. {new_guid} has same " + f"hash as {guid}.\n Details: {record} is a different " + f"record with same hash as existing guid: {guid}." + ) + logging.info(info_msg) + + +def _get_updated_record( + new_record, + existing_record, + continue_after_error, + columns_with_arrays, +): + record_to_write = copy.deepcopy(existing_record) + + # for any column not in the standard set, either update the existing + # record with new data, or leave column as data provided + for column_name in [ + key + for key in new_record.keys() + if key + not in ( + GUID_STANDARD_KEY, + SIZE_STANDARD_KEY, + MD5_STANDARD_KEY, + ) + ]: + # first handle space-delimited columns + if column_name in columns_with_arrays: + if column_name in existing_record: + # column that has a space-delimited array of values + record_to_write[column_name] = " ".join( + sorted( list( set( - record_to_write[URLS_STANDARD_KEY].split(" ") - + urls.split(" ") + new_record[column_name].split(" ") + + existing_record[column_name].split(" ") ) ) - ).strip(" ") - - # for any column not in the standard set, either update the existing - # record with new data, or initialize field to data provided - for column_name in [ - key - for key in record.keys() - if key - not in ( - GUID_STANDARD_KEY, - SIZE_STANDARD_KEY, - MD5_STANDARD_KEY, - ACL_STANDARD_KEY, - URLS_STANDARD_KEY, - AUTHZ_STANDARD_KEY, ) - ]: - if column_name in record_to_write: - record_to_write[column_name] = " ".join( - list( - set( - record_to_write[column_name].split(" ") - + record[column_name].split(" ") - ) - ) - ).strip(" ") - else: - record_to_write[column_name] = record[column_name] + ).strip(" ") + else: + record_to_write[column_name] = " ".join( + sorted(list(set(new_record[column_name].split(" ")))) + ).strip(" ") + # handle non-space-delimited columns + else: + if not existing_record.get(column_name) or ( + existing_record.get(column_name) == new_record[column_name] + ): + # use new record when nothing in existing record or it's the same data + record_to_write[column_name] = new_record[column_name] + elif not new_record[column_name]: + # persist existing data if no new data + record_to_write[column_name] = existing_record.get(column_name, "") + else: + # old and new have different values, unsure how to merge + error_msg = ( + f"NOT merging column {column_name} for " + f"existing {existing_record} and new " + f"{new_record} because unsure how to merge the values.\nERROR: IGNORING NEW VALUE if " + f"forced to continue without error." + ) + logging.error(error_msg) - # if there's NOT a previous guid matching this record and we're NOT allowing - # duplicates, remove existing record so that we can replace with newly updated one - if not (previous_guid_exists and allow_mult_guids_per_hash): - all_rows[record_to_write[MD5_STANDARD_KEY]] = [] + if not continue_after_error: + raise csv.Error(error_msg) - for key in record_to_write.keys(): - headers.add(key) + # if we're here, that means we are just going to ignore new data + # and add a row with the existing data + + return record_to_write - all_rows.setdefault(record_to_write[MD5_STANDARD_KEY], []).append( - record_to_write - ) +def _create_output_file( + output_manifest, headers, all_rows, output_manifest_file_delimiter +): if output_manifest_file_delimiter is None: output_manifest_file_ext = os.path.splitext(output_manifest) if output_manifest_file_ext[-1].lower() == ".tsv": @@ -208,24 +392,21 @@ def merge_bucket_manifests( else: output_manifest_file_delimiter = "," - # order headers with alphabetical for standard columns, followed by alphabetical for + # order headers logically for standard columns, followed by alphabetical for # non-standard columns - stardard_headers = sorted( - [ - GUID_STANDARD_KEY, - SIZE_STANDARD_KEY, - MD5_STANDARD_KEY, - ACL_STANDARD_KEY, - URLS_STANDARD_KEY, - AUTHZ_STANDARD_KEY, - ] - ) + stardard_headers = [ + GUID_STANDARD_KEY, + SIZE_STANDARD_KEY, + MD5_STANDARD_KEY, + ACL_STANDARD_KEY, + AUTHZ_STANDARD_KEY, + URLS_STANDARD_KEY, + ] non_standard_headers = sorted( [header for header in headers if header not in stardard_headers] ) headers = stardard_headers + non_standard_headers - with open(output_manifest, "w") as outfile: logging.info(f"Writing merged manifest to {output_manifest}") logging.info(f"Headers {headers}") diff --git a/tests/merge_manifests/multiple_guids_per_hash/expected-merged-output-manifest.tsv b/tests/merge_manifests/multiple_guids_per_hash/expected-merged-output-manifest.tsv index ee591aaa4..f33309c95 100644 --- a/tests/merge_manifests/multiple_guids_per_hash/expected-merged-output-manifest.tsv +++ b/tests/merge_manifests/multiple_guids_per_hash/expected-merged-output-manifest.tsv @@ -1,3 +1,4 @@ -acl authz guid md5 size urls extra_data - dg/123 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 http://cats.com s3://bucket/cats baz foo - dg/124 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 http://cats.com bar +acl authz guid md5 size urls more_data extra_data + /baz /foo /foobar dg/123 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 http://cats.com s3://bucket/cats stuff stuff3 + /bar /foobar dg/124 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 http://cats.com stuff3 + /baz /foobar dg/125 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 s3://bucket/duplicate moredata stuff2 stuff3 diff --git a/tests/merge_manifests/multiple_guids_per_hash/input/manifest1.tsv b/tests/merge_manifests/multiple_guids_per_hash/input/manifest1.tsv index 7e20e55b1..c8c28f842 100644 --- a/tests/merge_manifests/multiple_guids_per_hash/input/manifest1.tsv +++ b/tests/merge_manifests/multiple_guids_per_hash/input/manifest1.tsv @@ -1,2 +1,3 @@ -guid md5 size urls extra_data -dg/123 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 http://cats.com foo +guid md5 size urls authz more_data +dg/123 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 http://cats.com /foo +dg/125 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 s3://bucket/duplicate /baz moredata diff --git a/tests/merge_manifests/multiple_guids_per_hash/input/manifest2.tsv b/tests/merge_manifests/multiple_guids_per_hash/input/manifest2.tsv index 76d93319a..b4534dc7e 100644 --- a/tests/merge_manifests/multiple_guids_per_hash/input/manifest2.tsv +++ b/tests/merge_manifests/multiple_guids_per_hash/input/manifest2.tsv @@ -1,3 +1,5 @@ -guid md5 size urls extra_data -dg/123 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 s3://bucket/cats baz -dg/124 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 http://cats.com bar +guid md5 size urls authz extra_data + f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 /foobar stuff3 +dg/123 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 s3://bucket/cats /baz stuff +dg/124 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 http://cats.com /bar +dg/125 f7cbeb4f7fcc139d95cb9cc1cf0696ec 42 s3://bucket/duplicate /baz stuff2 diff --git a/tests/merge_manifests/test_manifest_merge.py b/tests/merge_manifests/test_manifest_merge.py index 6f1356d82..9ad0ea8cc 100644 --- a/tests/merge_manifests/test_manifest_merge.py +++ b/tests/merge_manifests/test_manifest_merge.py @@ -12,6 +12,7 @@ def test_regular_merge_bucket_manifests(): merge_bucket_manifests( directory="tests/merge_manifests/regular/input/", output_manifest="merged-output-test-manifest.tsv", + columns_with_arrays=["extra_data", "more_data", "some_additional_data"], ) assert _get_tsv_data("merged-output-test-manifest.tsv") == _get_tsv_data( "tests/merge_manifests/regular/expected-merged-output-manifest.tsv" @@ -25,6 +26,7 @@ def test_writing_to_csv(): merge_bucket_manifests( directory="tests/merge_manifests/regular/input/", output_manifest="merged-output-test-manifest.csv", + columns_with_arrays=["extra_data", "more_data", "some_additional_data"], ) assert _get_tsv_data("merged-output-test-manifest.csv", ",") == _get_tsv_data( "tests/merge_manifests/regular/expected-merged-output-manifest.tsv" @@ -38,6 +40,7 @@ def test_multiple_guids_per_hash(): merge_bucket_manifests( directory="tests/merge_manifests/multiple_guids_per_hash/input", output_manifest="merged-output-test-manifest.tsv", + columns_with_arrays=["extra_data", "more_data", "some_additional_data"], allow_mult_guids_per_hash=True, ) assert _get_tsv_data("merged-output-test-manifest.tsv") == _get_tsv_data( @@ -52,6 +55,7 @@ def test_same_guid_for_same_hash(): merge_bucket_manifests( directory="tests/merge_manifests/same_guid_for_same_hash/input", output_manifest="merged-output-test-manifest.tsv", + columns_with_arrays=["extra_data", "more_data", "some_additional_data"], allow_mult_guids_per_hash=True, ) assert _get_tsv_data("merged-output-test-manifest.tsv") == _get_tsv_data( @@ -66,6 +70,7 @@ def test_multiple_urls(): merge_bucket_manifests( directory="tests/merge_manifests/multiple_urls/input", output_manifest="merged-output-test-manifest.tsv", + columns_with_arrays=["extra_data", "more_data", "some_additional_data"], ) assert _get_tsv_data("merged-output-test-manifest.tsv") == _get_tsv_data( "tests/merge_manifests/multiple_urls/expected-merged-output-manifest.tsv" @@ -80,6 +85,7 @@ def test_duplicate_values(): merge_bucket_manifests( directory="tests/merge_manifests/duplicate_values/input", output_manifest="merged-output-test-manifest.tsv", + columns_with_arrays=["extra_data", "more_data", "some_additional_data", "food"], ) assert _get_tsv_data("merged-output-test-manifest.tsv") == _get_tsv_data( "tests/merge_manifests/duplicate_values/expected-merged-output-manifest.tsv" @@ -95,6 +101,7 @@ def test_size_mismatch(): merge_bucket_manifests( directory="tests/merge_manifests/size_mismatch/input", output_manifest="merged-output-test-manifest.tsv", + columns_with_arrays=["extra_data", "more_data", "some_additional_data"], ) diff --git a/tests/test_manifest.csv b/tests/test_manifest.csv index 26a2af162..0fdae5972 100644 --- a/tests/test_manifest.csv +++ b/tests/test_manifest.csv @@ -1,4 +1,4 @@ guid,authz,acl,file_size,md5,urls dg.TEST/f2a39f98-6ae1-48a5-8d48-825a0c52a22b,/programs/DEV/projects/test,DEV test,123,a1234567891234567890123456789012,gs://test/test.txt s3://testaws/aws/test.txt -dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2,/programs/DEV/projects/test2,DEV,235,c1234567891234567890123456789012,gs://test/test3.txt -dg.TEST/9c205cd7-c399-4503-9f49-5647188bde66,/programs/DEV/projects/test3 /programs/DEV/projects/test3bak,DEV test3,334,b1334567891334567890133456789013,gs://test/test.txt +dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2,/programs/DEV/projects/test2,DEV,235,c1234567891234567890123456789012,"gs://test/test%203.txt s3://testaws/file%20space.txt s3://testaws/aws/file,with,comma.txt" +dg.TEST/9c205cd7-c399-4503-9f49-5647188bde66,/programs/DEV/projects/test3 /programs/DEV/projects/test3bak,DEV test3,334,b1334567891334567890133456789013,gs://test/test.txt diff --git a/tests/test_manifests.py b/tests/test_manifests.py index 930c5ef21..9738fb359 100644 --- a/tests/test_manifests.py +++ b/tests/test_manifests.py @@ -72,7 +72,13 @@ def test_verify_manifest(mock_index): assert "c1234567891234567890123456789012" in logs[ "dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2" ].get("md5", {}).get("expected") - assert "gs://test/test3.txt" in logs[ + assert "gs://test/test 3.txt" in logs[ + "dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2" + ].get("urls", {}).get("expected") + assert "s3://testaws/file space.txt" in logs[ + "dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2" + ].get("urls", {}).get("expected") + assert "s3://testaws/aws/file,with,comma.txt" in logs[ "dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2" ].get("urls", {}).get("expected") diff --git a/tests/validate_manifest_format/manifests/manifest_with_invalid_urls.tsv b/tests/validate_manifest_format/manifests/manifest_with_invalid_urls.tsv index 0bb2060d4..72afc5532 100644 --- a/tests/validate_manifest_format/manifests/manifest_with_invalid_urls.tsv +++ b/tests/validate_manifest_format/manifests/manifest_with_invalid_urls.tsv @@ -13,10 +13,10 @@ authz acl file_size md5 urls /programs/DEV/projects/test3 /programs/DEV/projects/test3bak DEV test3 334 51bf75c48761b2e755adc1340e5a9255 https://www.uchicago.edu /programs/DEV/projects/test3 /programs/DEV/projects/test3bak DEV test3 334 51bf75c48761b2e755adc1340e5a9256 https://www.uchicago.edu/about /programs/DEV/projects/test3 /programs/DEV/projects/test3bak DEV test3 334 51bf75c48761b2e755adc1340e5a9257 google.com/path -/programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a9258 "" +/programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a9258 /programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a9259 '' /programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a925a [] /programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a925b [''] -/programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a925c [""] -/programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a925d ["", ""] -/programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a925e ["", ''] +/programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a925c "[""""]" +/programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a925d "["""" """"]" +/programs/DEV/projects/test3 DEV test3 334 51bf75c48761b2e755adc1340e5a925e "["""" '']" diff --git a/tests/validate_manifest_format/manifests/manifest_with_no_errors.tsv b/tests/validate_manifest_format/manifests/manifest_with_no_errors.tsv index ed9c35cb9..9b1a3aaf7 100644 --- a/tests/validate_manifest_format/manifests/manifest_with_no_errors.tsv +++ b/tests/validate_manifest_format/manifests/manifest_with_no_errors.tsv @@ -4,11 +4,12 @@ authz acl file_size md5 urls /a /programs/DEV/projects/test3bak DEV test3 334 51bf75c48761b2e755adc1340e5a9259 gs://test/test.txt [/programs/DEV/projects/test3, /programs/DEV/projects/test3bak] DEV test3 334 51bf75c48761b2e755adc1340e5a9259 gs://test/test.txt /programs/DEV/ DEV 235 d9a68f3d5d9ce03f8a08f50924247224 [gs://test/test3.txt] -"/programs/DEV/projects/test2" DEV "235" d9a68f3d5d9ce03f8a08f50924247225 ["gs://test/test3.txt"] +/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f50924247225 "[""gs://test/test3.txt""]" '/programs/DEV/projects/test2' DEV 235 d9a68f3d5d9ce03f8a08f50924247226 ['gs://test/test3.txt'] /programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f50924247227 [gs://test/test.txt s3://testaws/aws/test.txt] -/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f50924247228 ['gs://test/test3.txt', 's3://testaws/aws/test.txt'] -/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f50924247229 ["gs://test/test3.txt", "s3://testaws/aws/test.txt"] -/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f5092424722a ["gs://test/test3.txt", 's3://testaws/aws/test.txt'] -/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f5092424722b "gs://test/test3.txt" +/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f50924247228 ['gs://test/test3.txt' 's3://testaws/aws/test.txt'] +/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f50924247229 "[""gs://test/test3.txt"" ""s3://testaws/aws/test.txt""]" +/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f5092424722a "[""gs://test/test3.txt"" 's3://testaws/aws/test.txt']" +/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f5092424722b gs://test/test3.txt /programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f5092424722c 'gs://test/test3.txt' +/programs/DEV/projects/test2 DEV 235 d9a68f3d5d9ce03f8a08f5092424722c gs://test/test%203.txt s3://testaws/file%20space.txt s3://testaws/aws/file,with,comma.txt diff --git a/tests/validate_manifest_format/test_is_valid_manifest_format.py b/tests/validate_manifest_format/test_is_valid_manifest_format.py index 60001c966..5af6db753 100644 --- a/tests/validate_manifest_format/test_is_valid_manifest_format.py +++ b/tests/validate_manifest_format/test_is_valid_manifest_format.py @@ -173,13 +173,16 @@ def test_is_valid_manifest_format_with_invalid_urls(caplog): assert '"https://www.uchicago.edu"' in error_log assert '"https://www.uchicago.edu/about"' in error_log assert '"google.com/path"' in error_log - assert '""""' in error_log - assert "\"''\"" in error_log - assert '"[]"' in error_log - assert "\"['']\"" in error_log - assert '"[""]"' in error_log - assert '"["", ""]"' in error_log - assert '"["", \'\']"' in error_log + + # if the url resolves to nothing after replacing characters, the log may just say + # "is empty" and not list the original value + assert '""""' in error_log or "is empty" in error_log + assert "\"''\"" in error_log or "is empty" in error_log + assert '"[]"' in error_log or "is empty" in error_log + assert "\"['']\"" in error_log or "is empty" in error_log + assert '"[""]"' in error_log or "is empty" in error_log + assert '"["" ""]"' in error_log or "is empty" in error_log + assert '"["" \'\']"' in error_log or "is empty" in error_log assert result == False