mirror of
https://github.com/MacPaw/XADMaster.git
synced 2025-08-29 03:23:48 +02:00
363 lines
11 KiB
Objective-C
363 lines
11 KiB
Objective-C
/*
|
|
* XADWARCParser.m
|
|
*
|
|
* Copyright (c) 2017-present, MacPaw Inc. All rights reserved.
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this library; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
* MA 02110-1301 USA
|
|
*/
|
|
#import "XADWARCParser.h"
|
|
#import "Scanning.h"
|
|
|
|
@implementation XADWARCParser
|
|
|
|
// Number of leading bytes the recognizer needs to see. This is the length
// of a WARC version line including its line ending, "WARC/1.x\r\n".
+(int)requiredHeaderSize
{
	return 10;
}
|
|
|
|
// Decides whether a file is a WARC archive by looking at its first ten
// bytes, which must be exactly "WARC/1.0\r\n" or "WARC/1.1\r\n".
// The handle and name arguments are not consulted.
// Returns NO when fewer than ten bytes are available.
+(BOOL)recognizeFileWithHandle:(CSHandle *)handle firstBytes:(NSData *)data name:(NSString *)name
{
	// -length returns NSUInteger; keep the full width instead of narrowing
	// it to int, which could turn a very large length into a negative one.
	NSUInteger length=[data length];
	if(length<10) return NO;

	const uint8_t *bytes=[data bytes];

	return memcmp(bytes,"WARC/1.0\r\n",10)==0
	|| memcmp(bytes,"WARC/1.1\r\n",10)==0;
}
|
|
|
|
// Parses the archive in three passes:
//
// 1. Read the headers of every WARC record into memory, along with the
//    HTTP headers of application/http records, remembering where each
//    record's content starts and ends.
// 2. Pick out the "response" records whose HTTP status is 200 and arrange
//    them in a directory tree built from the components of their target URL.
// 3. Emit one archive entry per selected response, attaching the headers
//    and body of the record named by WARC-Concurrent-To when one is known.
//
// Raises an illegal data exception when a record lacks a Content-Length
// header.
-(void)parse
{
	CSHandle *fh=[self handle];

	NSMutableArray *recordarray=[NSMutableArray array];
	NSMutableDictionary *records=[NSMutableDictionary dictionary];

	// Read all WARC records into memory, along with the HTTP headers
	// for application/http records.

	NSMutableDictionary *lastrecord=nil;
	while(![fh atEndOfFile])
	{
		NSAutoreleasePool *pool=[NSAutoreleasePool new];

		NSString *marker=[fh readLineWithEncoding:NSUTF8StringEncoding];
		if(![marker isEqual:@"WARC/1.0"] && ![marker isEqual:@"WARC/1.1"])
		{
			// The Content-Length record was wrong, so attempt to find the next
			// record and correct the previously recorded record.
			BOOL found = [fh scanForByteString:(const uint8_t *)"\r\n\r\nWARC/1.0\r\n" length:14] || [fh scanForByteString:(const uint8_t *)"\r\n\r\nWARC/1.1\r\n" length:14];

			off_t realendofrecord=[fh offsetInFile];
			[lastrecord setObject:[NSNumber numberWithLongLong:realendofrecord] forKey:@"EndOfRecord"];

			if(!found)
			{
				// Balance the pool created at the top of this iteration
				// before leaving the loop.
				[pool release];
				break;
			}

			[fh skipBytes:14];
		}

		NSMutableDictionary *record=[self parseHTTPHeadersWithHandle:fh];
		lastrecord=record;

		off_t contentstart=[fh offsetInFile];

		NSString *recordid=[record objectForKey:@"WARC-Record-ID"];
		NSString *contentlength=[record objectForKey:@"Content-Length"];
		NSString *contenttype=[record objectForKey:@"Content-Type"];

		if(!contentlength) [XADException raiseIllegalDataException];
		NSScanner *scanner=[NSScanner scannerWithString:contentlength];
		long long length=0;
		[scanner scanLongLong:&length];

		// A negative length would move the handle backwards in the seek
		// below, which can make this loop read the same record over and
		// over. Treat it like any other wrong Content-Length: assume an
		// empty body and let the resynchronisation above find the next
		// record.
		if(length<0) length=0;

		off_t endofrecord=contentstart+length;

		[record setObject:[NSNumber numberWithLongLong:contentstart] forKey:@"ContentStart"];
		[record setObject:[NSNumber numberWithLongLong:endofrecord] forKey:@"EndOfRecord"];

		if([contenttype hasPrefix:@"application/http"])
		{
			NSArray *headers=[self readHTTPHeadersWithHandle:fh];
			off_t bodystart=[fh offsetInFile];

			[record setObject:headers forKey:@"HTTPHeaders"];
			[record setObject:[NSNumber numberWithLongLong:bodystart] forKey:@"HTTPBodyStart"];
		}

		[recordarray addObject:record];

		// -setObject:forKey: throws on a nil key, so a record without an
		// ID is kept in the list but cannot be looked up by ID later.
		if(recordid) [records setObject:record forKey:recordid];

		// Skip the content and the four bytes expected to separate records.
		[fh seekToFileOffset:endofrecord+4];

		[pool release];
	}

	// Find all response records with 200 status, and build a
	// directory tree of the file names.

	NSMutableArray *filerecords=[NSMutableArray array];
	NSMutableDictionary *root=[NSMutableDictionary dictionary];

	NSEnumerator *enumerator=[recordarray objectEnumerator];
	NSMutableDictionary *record;
	while((record=[enumerator nextObject]))
	{
		NSString *type=[record objectForKey:@"WARC-Type"];
		NSArray *headers=[record objectForKey:@"HTTPHeaders"];

		// The header list is empty when the HTTP part of the record has no
		// lines at all; -objectAtIndex: would throw in that case.
		NSString *status=[headers count]?[headers objectAtIndex:0]:nil;

		if([type isEqual:@"response"])
		if([status matchedByPattern:@"^HTTP/[0-9]+\\.[0-9]+ 200"])
		{
			NSString *target=[self getTargetURI:record];
			NSArray *components=[self pathComponentsForURLString:target];
			if(components)
			{
				NSMutableDictionary *dir=root;

				// Every component but the last names a directory. The
				// comparison is written as i+1<count so that an empty
				// array cannot underflow the unsigned count.
				NSUInteger count=[components count];
				for(NSUInteger i=0;i+1<count;i++)
				{
					NSString *component=[components objectAtIndex:i];
					dir=[self insertDirectory:component inDirectory:dir];
				}

				[self insertFile:[components lastObject] record:record inDirectory:dir];

				[filerecords addObject:record];
			}
			else NSLog(@"Failed to parse URL \"%@\"",target);
		}
	}

	// Walk the finished directory tree to generate XADPaths for all files.
	[self buildXADPathsForFilesInDirectory:root parentPath:[self XADPath]];

	// Iterate over the files, finding and loading the request
	// records and emit archive entries.

	enumerator=[filerecords objectEnumerator];
	while((record=[enumerator nextObject]))
	{
		NSString *target=[self getTargetURI:record];
		NSNumber *startnum=[record objectForKey:@"HTTPBodyStart"];
		NSNumber *endnum=[record objectForKey:@"EndOfRecord"];
		NSArray *responseheaders=[record objectForKey:@"HTTPHeaders"];
		XADPath *path=[record objectForKey:@"XADPath"];

		// The body runs from the end of the HTTP headers to the end of the
		// record. If the headers ran past the recorded end, report an empty
		// body rather than a negative size.
		long long bodylength=[endnum longLongValue]-[startnum longLongValue];
		if(bodylength<0) bodylength=0;
		NSNumber *lengthnum=[NSNumber numberWithLongLong:bodylength];

		NSMutableDictionary *dict=[NSMutableDictionary dictionaryWithObjectsAndKeys:
			path,XADFileNameKey,
			lengthnum,XADFileSizeKey,
			lengthnum,XADCompressedSizeKey,
			startnum,XADDataOffsetKey,
			lengthnum,XADDataLengthKey,
			target,@"WARCTargetURI",
			responseheaders,@"WARCResponseHeaders",
		nil];

		NSString *requestid=[record objectForKey:@"WARC-Concurrent-To"];
		NSDictionary *request=[records objectForKey:requestid];
		if(request)
		{
			// Only application/http records carry parsed HTTP headers, and
			// -setObject:forKey: throws on a nil object.
			NSArray *requestheaders=[request objectForKey:@"HTTPHeaders"];
			if(requestheaders) [dict setObject:requestheaders forKey:@"WARCRequestHeaders"];

			// No record ever stores a body length, so derive it from the
			// two offsets that are stored: the end of the HTTP headers and
			// the end of the record.
			NSNumber *requeststartnum=[request objectForKey:@"HTTPBodyStart"];
			NSNumber *requestendnum=[request objectForKey:@"EndOfRecord"];
			if(requeststartnum && requestendnum)
			{
				off_t start=[requeststartnum longLongValue];
				off_t length=[requestendnum longLongValue]-start;

				// -readDataOfLength: takes an int, so only read bodies whose
				// length fits in one.
				if(length>0 && length<=0x7fffffff)
				{
					@try
					{
						[fh seekToFileOffset:start];
						NSData *requestbody=[fh readDataOfLength:(int)length];

						if(requestbody) [dict setObject:requestbody forKey:@"WARCRequestBody"];
					}
					@catch(id e)
					{
						// The request body is optional metadata. Should the
						// handle raise (for instance on a truncated file),
						// still emit the entry, just without the body.
					}
				}
			}
		}

		[self addEntryWithDictionary:dict];
	}

	// TODO: Handle more record types, and store their contents as file and archive
	// metadata. Patches welcome!

}
|
|
|
|
|
|
|
|
|
|
// Reads "Name: value" lines from the handle and collects them in a
// dictionary keyed by the field name exactly as written. Reading stops at
// the first line that is empty (or could not be obtained). Lines that do
// not look like a header field are skipped, and a repeated name replaces
// the earlier value.
-(NSMutableDictionary *)parseHTTPHeadersWithHandle:(CSHandle *)handle
{
	NSMutableDictionary *headers=[NSMutableDictionary dictionary];
	for(;;)
	{
		NSString *line=[handle readLineWithEncoding:NSUTF8StringEncoding];
		if([line length]==0) return headers;

		// Whitespace after the colon is optional, so accept "Name:value"
		// and "Name:" too. Requiring at least one blank would silently drop
		// such fields, including a Content-Length the caller depends on.
		NSArray *matches=[line substringsCapturedByPattern:@"^([^:]+):[ \t]*(.*)$"];
		if(matches)
		{
			NSString *key=[matches objectAtIndex:1];
			NSString *value=[matches objectAtIndex:2];

			[headers setObject:value forKey:key];
		}
	}
}
|
|
|
|
// Collects raw lines from the handle, in order and unparsed, up to but not
// including the first line with zero length. The terminating line itself
// is consumed from the handle.
-(NSArray *)readHTTPHeadersWithHandle:(CSHandle *)handle
{
	NSMutableArray *lines=[NSMutableArray array];

	NSString *current=[handle readLineWithEncoding:NSUTF8StringEncoding];
	while([current length]!=0)
	{
		[lines addObject:current];
		current=[handle readLineWithEncoding:NSUTF8StringEncoding];
	}

	return lines;
}
|
|
|
|
// Returns the record's WARC-Target-URI with one pair of enclosing angle
// brackets removed, or the value unchanged when it is not bracketed.
//
// WARC 1.0 requires WARC-Target-URI to be surrounded by angle brackets,
// but most tools don't respect this requirement, so the brackets are
// stripped only if they're present.
//
// Read <https://github.com/iipc/warc-specifications/issues/23> for more
// details.
-(NSString *)getTargetURI:(NSDictionary *)record
{
	NSString *uri=[record objectForKey:@"WARC-Target-URI"];
	NSUInteger urilength=[uri length];

	// Anything shorter than "<>" cannot be a bracketed value.
	if(urilength<2) return uri;
	if([uri characterAtIndex:0]!='<') return uri;
	if([uri characterAtIndex:urilength-1]!='>') return uri;

	return [uri substringWithRange:NSMakeRange(1,urilength-2)];
}
|
|
|
|
// Splits an http or https URL into the components used to place it in the
// directory tree: the host first, followed by the components of the path.
// A URL without a path yields just the host. Returns nil when the string
// does not match the expected URL pattern.
-(NSArray *)pathComponentsForURLString:(NSString *)urlstring
{
	NSArray *captures=[urlstring substringsCapturedByPattern:@"^https?://([^/]+)(/.*|())$"];
	if(captures==nil) return nil;

	NSString *hostname=[captures objectAtIndex:1];
	NSString *urlpath=[captures objectAtIndex:2];

	if([urlpath length]==0) return [NSArray arrayWithObject:hostname];

	// The host takes the place of the first path component.
	NSMutableArray *parts=[NSMutableArray arrayWithArray:[urlpath pathComponents]];
	[parts replaceObjectAtIndex:0 withObject:hostname];

	// Drop the separator that is left over as the final component.
	if([[parts lastObject] isEqual:@"/"]) [parts removeLastObject];

	// TODO: Better processing of the path, handling escapes and such.

	return parts;
}
|
|
|
|
// Returns the subdirectory called name inside dir, creating it if needed.
// Entries carrying the "/" key are files; everything else is a directory.
// When a file already occupies the name, that file is moved into the newly
// created directory under the name "index.html".
-(NSMutableDictionary *)insertDirectory:(NSString *)name inDirectory:(NSMutableDictionary *)dir
{
	NSMutableDictionary *existing=[dir objectForKey:name];

	// Already a directory: nothing to do, hand it back.
	if(existing && ![existing objectForKey:@"/"]) return existing;

	if(!existing)
	{
		// The name is free, so a fresh directory can simply be added.
		NSMutableDictionary *created=[NSMutableDictionary dictionary];
		[dir setObject:created forKey:name];
		return created;
	}

	// A file is in the way. Keep it alive while it is taken out of dir,
	// put a directory in its place, and re-home the file inside it.
	[[existing retain] autorelease];
	[dir removeObjectForKey:name];

	NSMutableDictionary *replacement=[NSMutableDictionary dictionary];
	[dir setObject:replacement forKey:name];

	[self insertFile:@"index.html" record:existing inDirectory:replacement];

	return replacement;
}
|
|
|
|
// Stores record in dir as a file, tagging it with the "/" key that
// distinguishes files from directories. If the name is taken by a
// directory, the file goes into that directory as "index.html" instead;
// if it is taken by another file, the first unused name of the form
// "name.1", "name.2", ... is used.
-(void)insertFile:(NSString *)name record:(NSMutableDictionary *)record inDirectory:(NSMutableDictionary *)dir
{
	[record setObject:[NSNull null] forKey:@"/"]; // Mark the record as a file.

	NSMutableDictionary *occupant=[dir objectForKey:name];

	if(!occupant)
	{
		// Nothing there yet, so the file takes the name as is.
		[dir setObject:record forKey:name];
		return;
	}

	if(![occupant objectForKey:@"/"])
	{
		// The name belongs to a directory. Try again inside it, using
		// "index.html" as the file name.
		[self insertFile:@"index.html" record:record inDirectory:occupant];
		return;
	}

	// The name belongs to another file. Count upwards until a numbered
	// variant of the name is free.
	NSString *candidate=nil;
	for(int suffix=1;;suffix++)
	{
		candidate=[NSString stringWithFormat:@"%@.%d",name,suffix];
		if(![dir objectForKey:candidate]) break;
	}

	[dir setObject:record forKey:candidate];
}
|
|
|
|
// Recursively walks the directory tree rooted at dir. Each entry's path is
// the parent path extended by the entry's name; file entries (those with
// the "/" key) get that path stored under "XADPath", while directory
// entries are descended into with it as the new parent.
-(void)buildXADPathsForFilesInDirectory:(NSMutableDictionary *)dir parentPath:(XADPath *)parent
{
	NSEnumerator *names=[dir keyEnumerator];

	for(NSString *childname=[names nextObject];childname;childname=[names nextObject])
	{
		NSMutableDictionary *child=[dir objectForKey:childname];
		XADString *component=[self XADStringWithString:childname];
		XADPath *childpath=[parent pathByAppendingXADStringComponent:component];

		if(![child objectForKey:@"/"])
		{
			// A directory: continue with its contents.
			[self buildXADPathsForFilesInDirectory:child parentPath:childpath];
			continue;
		}

		// A file: remember where it lives.
		[child setObject:childpath forKey:@"XADPath"];
	}
}
|
|
|
|
|
|
// Returns a handle for the entry's data, obtained from the data offset
// information in the entry dictionary. The checksum flag is not used.
-(CSHandle *)handleForEntryWithDictionary:(NSDictionary *)dict wantChecksum:(BOOL)checksum
{
	CSHandle *entryhandle=[self handleAtDataOffsetForDictionary:dict];
	return entryhandle;
}
|
|
|
|
// Name of the archive format handled by this parser.
-(NSString *)formatName
{
	return @"WARC";
}
|
|
|
|
@end
|