Skip to content

Commit

Permalink
fix WARC record content-type lookup
Browse files Browse the repository at this point in the history
  • Loading branch information
ikreymer committed Aug 23, 2024
1 parent 57cfbd2 commit bf5b364
Show file tree
Hide file tree
Showing 7 changed files with 7 additions and 7 deletions.
2 changes: 1 addition & 1 deletion dist/cli.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@
`)}return e.type="warcinfo",n.create(e,s())}getResponseInfo(){let e=this.httpHeaders;return e?{headers:e.headers,status:e.statusCode,statusText:e.statusText}:null}fixUp(){let e=this.warcHeaders.headers.get("WARC-Target-URI");e&&e.startsWith("<")&&e.endsWith(">")&&this.warcHeaders.headers.set("WARC-Target-URI",e.slice(1,-1))}async readFully(e=!1){if(this.httpHeaders){if(this.payload&&!this.payload.length)return this.payload;if(this._contentReader&&!e)throw new TypeError("WARC Record decoding already started, but requesting raw payload");if(e&&this.consumed==="raw"&&this.payload)return await this._createDecodingReader([this.payload]).readFully()}return this.payload?this.payload:(e?(this.payload=await super.readFully(),this.consumed="content"):(this.payload=await y.readFully(this._reader),this.consumed="raw"),this.payload)}get reader(){if(this.payload&&!this.payload.length)return V();if(this._contentReader)throw new TypeError("WARC Record decoding already started, but requesting raw payload");return this._reader}get contentReader(){return this.httpHeaders?(this._contentReader||(this._contentReader=this._createDecodingReader(this._reader)),this._contentReader):this._reader}_createDecodingReader(e){if(!this.httpHeaders)throw new Error("WARCRecord cannot call _createDecodingReader when this.httpHeaders === null");let t=this.httpHeaders.headers.get("Content-Encoding"),s=this.httpHeaders.headers.get("Transfer-Encoding"),a=s==="chunked";return!t&&!a&&(t=s),new R(e,t,a)}async readlineRaw(e){if(this.consumed)throw new Error("Record already consumed.. Perhaps a promise was not awaited?");if(this.contentReader instanceof y)return this.contentReader.readlineRaw(e);throw new Error("WARCRecord cannot call readlineRaw on this.contentReader if it does not extend BaseAsyncIterReader")}async contentText(){let e=await this.readFully(!0);return xe.decode(e)}async*[Symbol.asyncIterator](){for await(let e of this.contentReader)if(yield e,this.consumed)throw new Error("Record already consumed.. Perhaps a promise was not awaited?");this.consumed="content"}async skipFully(){if(!this.consumed){if(this._reader instanceof w){let e=await this._reader.skipFully();return this.consumed="skipped",e}throw new Error("WARCRecord cannot call skipFully on this._reader if it is not a LimitReader")}}warcHeader(e){return this.warcHeaders.headers.get(e)}get warcType(){return this.warcHeaders.headers.get("WARC-Type")}get warcTargetURI(){return this.warcHeaders.headers.get("WARC-Target-URI")}get warcDate(){return this.warcHeaders.headers.get("WARC-Date")}get warcRefersToTargetURI(){return this.warcHeaders.headers.get("WARC-Refers-To-Target-URI")}get warcRefersToDate(){return this.warcHeaders.headers.get("WARC-Refers-To-Date")}get warcPayloadDigest(){return this.warcHeaders.headers.get("WARC-Payload-Digest")}get warcBlockDigest(){return this.warcHeaders.headers.get("WARC-Block-Digest")}get warcContentType(){return this.warcHeaders.headers.get("Content-Type")}get warcContentLength(){return Number(this.warcHeaders.headers.get("Content-Length"))}};async function*V(){}var Z=new TextDecoder,P=new Uint8Array([]),W=class n{static async parse(r,e){return new n(r,e).parse()}static iterRecords(r,e){return new n(r,e)[Symbol.asyncIterator]()}constructor(r,{keepHeadersCase:e=!1,parseHttp:t=!0}={}){this._offset=0,this._warcHeadersLength=0,this._headersClass=e?Map:Headers,this._parseHttp=t,r instanceof R?this._reader=r:this._reader=new R(r),this._record=null}async readToNextRecord(){if(!this._reader||!this._record)return P;await this._record.skipFully(),this._reader.compressed&&(this._offset=this._reader.getRawOffset());let r=await this._reader.readlineRaw(),e=0;if(!r)r=P;else{if(e=r.byteLength-1,e===9&&Z.decode(r).startsWith("WARC/"))return r;for(;e>0;){let t=r[e-1];if(t!==10&&t!==13)break;e--}e&&console.warn(`Content-Length Too Small: Record not followed by newline, Remainder Length: ${e}, Offset: ${this._reader.getRawOffset()-r.byteLength}`)}if(this._reader.compressed)await this._reader.skipSize(2),r=P;else{for(r=await this._reader.readlineRaw();r&&r.byteLength===2;)r=await this._reader.readlineRaw();this._offset=this._reader.getRawOffset(),r&&(this._offset-=r.length)}return r}_initRecordReader(r){return new w(this._reader,Number(r.headers.get("Content-Length")||0))}async parse(){let r=await this.readToNextRecord(),e=r?Z.decode(r):"",t=new k,s=await t.parse(this._reader,{firstLine:e,headersClass:this._headersClass});if(!s)return null;this._warcHeadersLength=this._reader.getReadOffset();let a=new T({warcHeaders:s,reader:this._initRecordReader(s)});if(this._record=a,this._parseHttp)switch(a.warcType){case"response":case"request":await this._addHttpHeaders(a,t);break;case"revisit":a.warcContentLength>0&&await this._addHttpHeaders(a,t);break}return a}get offset(){return this._offset}get recordLength(){return this._reader.getRawLength(this._offset)}async*[Symbol.asyncIterator](){let r=null;for(;(r=await this.parse())!==null;)yield r;this._record=null}async _addHttpHeaders(r,e){let t=await e.parse(this._reader,{headersClass:this._headersClass});r.httpHeaders=t;let s=this._reader.getReadOffset()-this._warcHeadersLength;r.reader instanceof w&&r.reader.setLimitSkip(r.warcContentLength-s)}};var Te=["offset","warc-type","warc-target-uri"],O=class{constructor(r={}){this.opts=r,this.fields=r.fields?r.fields.split(","):Te,this.parseHttp=!1}serialize(r){return JSON.stringify(r)+`
`}write(r,e){e.write(this.serialize(r))}async writeAll(r,e){for await(let t of this.iterIndex(r))this.write(t,e)}async*iterIndex(r){let e={strictHeaders:!0,parseHttp:this.parseHttp};for(let{filename:t,reader:s}of r){let a=new W(s,e);yield*this.iterRecords(a,t)}}async*iterRecords(r,e){for await(let t of r){await t.skipFully();let s=this.indexRecord(t,r,e);s&&(yield s)}}indexRecord(r,e,t){if(this.filterRecord&&!this.filterRecord(r))return null;let s={},{offset:a,recordLength:i}=e,o={offset:a,length:i,filename:t};for(let d of this.fields)d in o?s[d]=o[d]:this.setField(d,r,s);return s}setField(r,e,t){let s=this.getField(r,e);s!==null&&(t[r]=s)}getField(r,e){if(r==="http:status")return e.httpHeaders&&(e.warcType==="response"||e.warcType==="revisit")?e.httpHeaders.statusCode:null;if(r.startsWith("http:")){if(e.httpHeaders){let t=e.httpHeaders.headers;return t instanceof Map&&(t=new Headers(Object.fromEntries(t))),t.get(r.slice(5))}return null}return e.warcHeaders.headers.get(r)||null}},C=class extends O{constructor(r){super(r);for(let e of this.fields)if(e.startsWith("http:")){this.parseHttp=!0;break}}},We="urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(","),Ue="urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split(","),I=class extends C{constructor(e){super(e);switch(this.includeAll=!!e?.all,this.overrideIndexForAll=!!e?.all,this.fields=We,this.parseHttp=!0,this.noSurt=!!e?.noSurt,this._lastRecord=null,e?.format){case"cdxj":this.serialize=this.serializeCDXJ;break;case"cdx":this.serialize=this.serializeCDX11;break;case"json":default:break}}async*iterRecords(e,t){this._lastRecord=null;for await(let a of e){await a.readFully();let i=this.indexRecord(a,e,t);i&&(yield i)}let s=this.indexRecord(null,e,t);s&&(yield s)}filterRecord(e){if(this.includeAll)return!0;let t=e.warcType;return!(t==="request"||t==="warcinfo"||(t==="metadata"||t==="resource")&&e.warcContentType==="application/warc-fields")}indexRecord(e,t,s){if(this.overrideIndexForAll)return e?super.indexRecord(e,t,s):null;let a=this._lastRecord;if(this._lastRecord=e,e&&(e._offset=t.offset,e._length=t.recordLength),!a)return null;if(!e||a.warcTargetURI!=e.warcTargetURI)return this.indexRecordPair(a,null,t,s);let i=e.warcType,o=a.warcType;return i==="request"&&(o==="response"||o==="revisit")?(this._lastRecord=null,this.indexRecordPair(a,e,t,s)):(i==="response"||i==="revisit")&&o==="request"?(this._lastRecord=null,this.indexRecordPair(e,a,t,s)):this.indexRecordPair(a,null,t,s)}indexRecordPair(e,t,s,a){let i,o,d=e.warcTargetURI||"";if(t?.httpHeaders&&t.httpHeaders.method!=="GET"){let c={url:d,method:t.httpHeaders.method,headers:t.httpHeaders.headers,postData:t.payload};i=c.method,Q(c)&&(o=c.requestBody,e.method=i,e.requestBody=o,d=c.url)}e._urlkey=d;let l=super.indexRecord(e,s,a);return l&&(e._offset!==void 0&&(l.offset=e._offset,l.length=e._length),i&&(l.method=i),o&&(l.requestBody=o)),l}serializeCDXJ(e){let{urlkey:t,timestamp:s}=e;return delete e.urlkey,delete e.timestamp,`${t} ${s} ${JSON.stringify(e,(i,o)=>["offset","length","status"].includes(i)?o==null?"":""+o:o)}
`}serializeCDX11(e){let t=[];for(let s of Ue)t.push(e[s]!=null?e[s]:"-");return t.join(" ")+`
`}getField(e,t){let s=null;switch(e){case"urlkey":return s=t._urlkey||t.warcTargetURI||null,this.noSurt||s===null?s:M(s);case"timestamp":return s=t.warcDate??"",s.replace(/[-:T]/g,"").slice(0,14);case"url":return t.warcTargetURI;case"mime":switch(t.warcType){case"revisit":return"warc/revisit";case"response":case"request":e="http:content-type";break;default:e="content-type"}return s=super.getField(e,t),s?s.toString().split(";",1)[0]?.trim():null;case"status":return super.getField("http:status",t);case"digest":return s=t.warcPayloadDigest,s?s.split(":",2)[1]:null;default:return null}}};var Y="2.3.0";var Le=1024*128;async function E(n=H.stdout,r){let e=Promise.resolve();return r=r||(0,se.hideBin)(process.argv),(0,re.default)().version(Y).usage("$0 [command]").command({command:"index <filenames..>",describe:"Index WARC(s)",builder:B,handler:async t=>{e=new C(t).writeAll(ee(t.filenames),n)}}).command({command:"cdx-index <filenames..>",describe:"CDX(J) Index of WARC(s)",builder:N,handler:async t=>{e=new I(t).writeAll(ee(t.filenames),n)}}).demandCommand(1,"Please specify a command").strictCommands().help().parseAsync(r),e}function ee(n){return n.reduce((r,e)=>{if(!(0,U.lstatSync)(e).isFile())return H.stderr.write(`Skipping ${e}, not a file
`}getField(e,t){let s=null;switch(e){case"urlkey":return s=t._urlkey||t.warcTargetURI||null,this.noSurt||s===null?s:M(s);case"timestamp":return s=t.warcDate??"",s.replace(/[-:T]/g,"").slice(0,14);case"url":return t.warcTargetURI;case"mime":switch(t.warcType){case"revisit":return"warc/revisit";case"response":case"request":e="http:content-type";break;default:return t.warcContentType}return s=super.getField(e,t),s?s.toString().split(";",1)[0]?.trim():null;case"status":return super.getField("http:status",t);case"digest":return s=t.warcPayloadDigest,s?s.split(":",2)[1]:null;default:return null}}};var Y="2.3.0";var Le=1024*128;async function E(n=H.stdout,r){let e=Promise.resolve();return r=r||(0,se.hideBin)(process.argv),(0,re.default)().version(Y).usage("$0 [command]").command({command:"index <filenames..>",describe:"Index WARC(s)",builder:B,handler:async t=>{e=new C(t).writeAll(ee(t.filenames),n)}}).command({command:"cdx-index <filenames..>",describe:"CDX(J) Index of WARC(s)",builder:N,handler:async t=>{e=new I(t).writeAll(ee(t.filenames),n)}}).demandCommand(1,"Please specify a command").strictCommands().help().parseAsync(r),e}function ee(n){return n.reduce((r,e)=>{if(!(0,U.lstatSync)(e).isFile())return H.stderr.write(`Skipping ${e}, not a file
`),r;let t=(0,U.createReadStream)(e,{highWaterMark:Le});return e=(0,te.basename)(e),r.push({filename:e,reader:t}),r},[])}E();
Loading

0 comments on commit bf5b364

Please sign in to comment.