// Mirror of https://github.com/MacPaw/XADMaster.git (synced 2025-08-28 19:13:49 +02:00)
// 455 lines, 11 KiB, Objective-C
/*
 * XADString.m
 *
 * Copyright (c) 2017-present, MacPaw Inc. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301 USA
 */
|
|
#import "XADString.h"
|
|
|
|
#import "../UniversalDetector/UniversalDetector.h"
|
|
|
|
|
|
|
|
// Names of the text encodings this library refers to. The values are the
// charset identifiers that get compared against detector output and handed
// to the decoding routines, so they must not be altered.

// Universal encodings.
NSString *XADASCIIStringEncodingName = @"US-ASCII";
NSString *XADUTF8StringEncodingName = @"UTF-8";

// ISO 8859 family.
NSString *XADISOLatin1StringEncodingName = @"iso-8859-1";
NSString *XADISOLatin2StringEncodingName = @"iso-8859-2";
NSString *XADISOLatin3StringEncodingName = @"iso-8859-3";
NSString *XADISOLatin4StringEncodingName = @"iso-8859-4";
NSString *XADISOLatin5StringEncodingName = @"iso-8859-5";
NSString *XADISOLatin6StringEncodingName = @"iso-8859-6";
NSString *XADISOLatin7StringEncodingName = @"iso-8859-7";
NSString *XADISOLatin8StringEncodingName = @"iso-8859-8";
NSString *XADISOLatin9StringEncodingName = @"iso-8859-9";
NSString *XADISOLatin10StringEncodingName = @"iso-8859-10";
NSString *XADISOLatin11StringEncodingName = @"iso-8859-11";
NSString *XADISOLatin12StringEncodingName = @"iso-8859-12";
NSString *XADISOLatin13StringEncodingName = @"iso-8859-13";
NSString *XADISOLatin14StringEncodingName = @"iso-8859-14";
NSString *XADISOLatin15StringEncodingName = @"iso-8859-15";
NSString *XADISOLatin16StringEncodingName = @"iso-8859-16";

// Japanese.
NSString *XADShiftJISStringEncodingName = @"Shift_JIS";

// Windows code pages.
NSString *XADWindowsCP1250StringEncodingName = @"windows-1250";
NSString *XADWindowsCP1251StringEncodingName = @"windows-1251";
NSString *XADWindowsCP1252StringEncodingName = @"windows-1252";
NSString *XADWindowsCP1253StringEncodingName = @"windows-1253";
NSString *XADWindowsCP1254StringEncodingName = @"windows-1254";

// Classic Mac OS encodings.
NSString *XADMacOSRomanStringEncodingName = @"macintosh";
NSString *XADMacOSJapaneseStringEncodingName = @"x-mac-japanese";
NSString *XADMacOSTraditionalChineseStringEncodingName = @"x-mac-trad-chinese";
NSString *XADMacOSKoreanStringEncodingName = @"x-mac-korean";
NSString *XADMacOSArabicStringEncodingName = @"x-mac-arabic";
NSString *XADMacOSHebrewStringEncodingName = @"x-mac-hebrew";
NSString *XADMacOSGreekStringEncodingName = @"x-mac-greek";
NSString *XADMacOSCyrillicStringEncodingName = @"x-mac-cyrillic";
NSString *XADMacOSSimplifiedChineseStringEncodingName = @"x-mac-simp-chinese";
NSString *XADMacOSRomanianStringEncodingName = @"x-mac-romanian";
NSString *XADMacOSUkranianStringEncodingName = @"x-mac-ukrainian";
NSString *XADMacOSThaiStringEncodingName = @"x-mac-thai";
NSString *XADMacOSCentralEuropeanRomanStringEncodingName = @"x-mac-centraleurroman";
NSString *XADMacOSIcelandicStringEncodingName = @"x-mac-icelandic";
NSString *XADMacOSTurkishStringEncodingName = @"x-mac-turkish";
NSString *XADMacOSCroatianStringEncodingName = @"x-mac-croatian";

// Defined at the bottom of this file: YES when no byte of the data has its
// high bit set.
static BOOL IsDataASCII(NSData *data);
|
|
|
|
// XADString holds either an already-decoded NSString, or raw bytes together
// with the XADStringSource used to pick an encoding for them. Exactly one of
// `string` and `data` is set by the initializers in this file.
@implementation XADString

// Returns an autoreleased XADString wrapping an already-decoded string.
+(XADString *)XADStringWithString:(NSString *)string
{
	return [[[self alloc] initWithString:string] autorelease];
}

// Feeds bytedata to stringsource for analysis, then wraps it. Pure 7-bit data
// is decoded immediately as US-ASCII; anything else stays as raw bytes bound
// to stringsource so it can be decoded later.
+(XADString *)analyzedXADStringWithData:(NSData *)bytedata source:(XADStringSource *)stringsource
{
	[stringsource analyzeData:bytedata];

	if(IsDataASCII(bytedata))
	{
		return [self decodedXADStringWithData:bytedata encodingName:XADASCIIStringEncodingName];
	}
	else
	{
		return [[[self alloc] initWithData:bytedata source:stringsource] autorelease];
	}
}

// Decodes bytedata with the named encoding right away and wraps the result.
// NOTE(review): +stringForData:encodingName: is implemented elsewhere; if it
// returns nil the result wraps a nil string - confirm this is intended.
+(XADString *)decodedXADStringWithData:(NSData *)bytedata encodingName:(NSString *)encoding
{
	NSString *string=[XADString stringForData:bytedata encodingName:encoding];
	return [[[self alloc] initWithString:string] autorelease];
}



// Decodes data with the named encoding, falling back on the escaped ASCII
// form when the decoder yields nil.
+(NSString *)escapedStringForData:(NSData *)data encodingName:(NSString *)encoding
{
	NSString *decstr=[XADString stringForData:data encodingName:encoding];
	if(decstr) return decstr;

	// Fall back on escaped ASCII if the encoding was unusable.
	return [self escapedASCIIStringForBytes:[data bytes] length:[data length]];
}

// Same as +escapedStringForData:encodingName: for a raw byte buffer.
+(NSString *)escapedStringForBytes:(const void *)bytes length:(size_t)length
encodingName:(NSString *)encoding
{
	NSString *decstr=[XADString stringForBytes:bytes length:length encodingName:encoding];
	if(decstr) return decstr;

	// Fall back on escaped ASCII if the encoding was unusable.
	return [self escapedASCIIStringForBytes:bytes length:length];
}

// Builds a string from raw bytes: bytes below 0x80 are appended as characters,
// all others as "%xx" with two lowercase hex digits. A literal '%' in the
// input is not escaped.
+(NSString *)escapedASCIIStringForBytes:(const void *)bytes length:(size_t)length
{
	NSMutableString *str=[NSMutableString stringWithCapacity:length];

	const uint8_t *byteptr=bytes;
	// The index has the same type as length, so the comparison is neither
	// signed/unsigned nor able to overflow for large buffers.
	for(size_t i=0;i<length;i++)
	{
		if(byteptr[i]<0x80) [str appendFormat:@"%c",byteptr[i]];
		else [str appendFormat:@"%%%02x",byteptr[i]];
	}

	return [NSString stringWithString:str];
}

// Encodes a string as 7-bit data: UTF-16 units below 0x80 become one byte
// each, all others become the six bytes "%uXXXX" (lowercase hex). A nil
// string gives empty data.
+(NSData *)escapedASCIIDataForString:(NSString *)string
{
	NSUInteger length=[string length];
	NSMutableData *encdata=[NSMutableData dataWithCapacity:length];

	for(NSUInteger i=0;i<length;i++)
	{
		unichar c=[string characterAtIndex:i];
		if(c<0x80)
		{
			char byte=(char)c;
			[encdata appendBytes:&byte length:1];
		}
		else
		{
			// "%u" plus four hex digits plus the terminator is 7 bytes;
			// snprintf keeps the write bounded to the buffer regardless.
			char escape[8];
			snprintf(escape,sizeof(escape),"%%u%04x",(unsigned int)(c&0xffff));
			[encdata appendBytes:escape length:6];
		}
	}

	return [NSData dataWithData:encdata];
	// Do not use this because Cocotron doesn't support it.
	//return [string dataUsingEncoding:NSNonLossyASCIIStringEncoding];
}



// Initializer for undecoded bytes that will be decoded using stringsource.
-(id)initWithData:(NSData *)bytedata source:(XADStringSource *)stringsource
{
	if((self=[super init]))
	{
		// Copy rather than retain, so a caller's NSMutableData cannot change
		// this object's contents (and hash) afterwards.
		data=[bytedata copy];
		string=nil;
		source=[stringsource retain];
	}
	return self;
}

// Initializer for a string whose text is already known.
-(id)initWithString:(NSString *)knownstring
{
	if((self=[super init]))
	{
		// Copy rather than retain, so a caller's NSMutableString cannot change
		// this object's contents (and hash) afterwards.
		string=[knownstring copy];
		data=nil;
		source=nil;
	}
	return self;
}

-(void)dealloc
{
	[data release];
	[string release];
	[source release];
	[super dealloc];
}



// YES if a decoded string is already held; otherwise asks
// +canDecodeData:encodingName: about the raw bytes.
-(BOOL)canDecodeWithEncodingName:(NSString *)encoding
{
	if(string) return YES;
	return [XADString canDecodeData:data encodingName:encoding];
}

// The string, decoded with whatever encoding the source currently reports.
-(NSString *)string
{
	return [self stringWithEncodingName:[source encodingName]];
}

// The stored string if there is one; otherwise the raw bytes decoded with the
// given encoding, or escaped if that fails. Returns nil when neither a string
// nor data is held.
-(NSString *)stringWithEncodingName:(NSString *)encoding
{
	if(string) return string;
	if(!data) return nil;
	return [XADString escapedStringForData:data encodingName:encoding];
}

// The raw bytes if held; otherwise the escaped 7-bit form of the string.
-(NSData *)data
{
	if(data) return data;
	return [XADString escapedASCIIDataForString:string];
}



// YES when there is no source, or when the source has a fixed encoding.
-(BOOL)encodingIsKnown
{
	if(!source) return YES;
	if([source hasFixedEncoding]) return YES;
	return NO;
}

// The source's encoding name, or UTF-8 when there is no source.
-(NSString *)encodingName
{
	if(!source) return XADUTF8StringEncodingName; // TODO: what should this really return?
	return [source encodingName];
}

// The source's confidence, or 1 when there is no source.
-(float)confidence
{
	if(!source) return 1;
	return [source confidence];
}



-(XADStringSource *)source { return source; }



// Tests whether the string starts with asciiprefix. For undecoded data the
// comparison is made byte-by-byte against the prefix's UTF-16 units, so it is
// only meaningful for prefixes made up of 7-bit characters.
-(BOOL)hasASCIIPrefix:(NSString *)asciiprefix
{
	if(string) return [string hasPrefix:asciiprefix];
	else
	{
		// Unsigned throughout to match -length and avoid mixed-sign comparison.
		NSUInteger length=[asciiprefix length];
		if([data length]<length) return NO;

		const uint8_t *bytes=[data bytes];
		for(NSUInteger i=0;i<length;i++) if(bytes[i]!=[asciiprefix characterAtIndex:i]) return NO;

		return YES;
	}
}

// Returns a new autoreleased XADString without the first `length` characters
// (or bytes, for undecoded data), keeping the same source. No bounds checking
// is done here: callers are expected to pass a length no longer than the
// contents, e.g. after a successful -hasASCIIPrefix:.
-(XADString *)XADStringByStrippingASCIIPrefixOfLength:(int)length
{
	if(string)
	{
		return [[[XADString alloc]
		initWithString:[string substringFromIndex:length]]
		autorelease];
	}
	else
	{
		return [[[XADString alloc]
		initWithData:[data subdataWithRange:
		NSMakeRange(length,[data length]-length)]
		source:source] autorelease];
	}
}



// Compares against an NSString (using the decoded string) or another
// XADString. Two XADStrings are equal when both hold equal strings, or when
// both hold equal data from the identical source object; a string-backed and
// a data-backed object never compare equal.
// NOTE(review): a data-backed object can equal an NSString here while -hash
// returns the hash of the data, so this pairing is not hash-consistent.
-(BOOL)isEqual:(id)other
{
	if([other isKindOfClass:[NSString class]]) return [[self string] isEqual:other];
	else if([other isKindOfClass:[self class]])
	{
		XADString *xadstr=(XADString *)other;

		if(string&&xadstr->string) return [string isEqual:xadstr->string];
		else if(data&&xadstr->data&&source==xadstr->source) return [data isEqual:xadstr->data];
		else return NO;
	}
	else return NO;
}

// Hash of the stored string if there is one, otherwise of the raw data.
-(NSUInteger)hash
{
	if(string) return [string hash];
	else return [data hash];
}



// The decoded string, or "(nil)" when no string can be produced.
-(NSString *)description
{
	// TODO: more info?
	NSString *actualstring=[self string];
	if(!actualstring) return @"(nil)";
	return actualstring;
}

// Returns a new owned XADString with the same string, or the same data and
// source.
-(id)copyWithZone:(NSZone *)zone
{
	if(string) return [[XADString allocWithZone:zone] initWithString:string];
	else return [[XADString allocWithZone:zone] initWithData:data source:source];
}


#ifdef __APPLE__
// NSStringEncoding conveniences; each converts the encoding to its name with
// +encodingNameForEncoding: and forwards to the name-based method.
-(BOOL)canDecodeWithEncoding:(NSStringEncoding)encoding
{
	return [self canDecodeWithEncodingName:[XADString encodingNameForEncoding:encoding]];
}

-(NSString *)stringWithEncoding:(NSStringEncoding)encoding
{
	return [self stringWithEncodingName:[XADString encodingNameForEncoding:encoding]];
}

// The source's encoding, or NSUTF8StringEncoding when there is no source.
-(NSStringEncoding)encoding
{
	if(!source) return NSUTF8StringEncoding; // TODO: what should this really return?
	else return [source encoding];
}
#endif

@end
|
|
|
|
|
|
|
|
// XADStringSource accumulates byte samples in a UniversalDetector and reports
// the encoding to use for them, unless a fixed encoding name has been set.
@implementation XADStringSource

-(id)init
{
	self=[super init];
	if(!self) return nil;

	detector=[UniversalDetector new]; // can return nil if UniversalDetector is not found
	fixedencodingname=nil;
	mac=NO;
	hasanalyzeddata=NO;

	return self;
}

-(void)dealloc
{
	[detector release];
	[fixedencodingname release];
	[super dealloc];
}

// Marks the source as having seen data and passes the bytes to the detector.
-(void)analyzeData:(NSData *)data
{
	hasanalyzeddata=YES;
	[detector analyzeData:data];
}

-(BOOL)hasAnalyzedData { return hasanalyzeddata; }

// Picks the encoding name, in order of priority: the fixed name if one is
// set; Windows-1252 if there is no detector; otherwise the detector's charset
// (Windows-1252 if it reports none), swapped for a Mac encoding when Mac
// encodings are preferred and a substitute is listed.
-(NSString *)encodingName
{
	if(fixedencodingname!=nil) return fixedencodingname;
	if(detector==nil) return XADWindowsCP1252StringEncodingName;

	NSString *charset=[detector MIMECharset];
	if(charset==nil) charset=XADWindowsCP1252StringEncodingName;

	if(!mac) return charset;

	// Kludge to use Mac encodings instead of the similar Windows encodings for Mac archives
	// TODO: improve
	static NSDictionary *macalternatives=nil;
	if(macalternatives==nil)
	{
		// Keys are lowercased so the lookup below is case-insensitive.
		macalternatives=[[NSDictionary alloc] initWithObjectsAndKeys:
			XADMacOSRomanStringEncodingName,[XADASCIIStringEncodingName lowercaseString],
			XADMacOSRomanStringEncodingName,[XADWindowsCP1252StringEncodingName lowercaseString],
			XADMacOSJapaneseStringEncodingName,[XADShiftJISStringEncodingName lowercaseString],
			nil];
	}

	NSString *replacement=[macalternatives objectForKey:[charset lowercaseString]];
	if(replacement!=nil) return replacement;

	return charset;
}

// 1 for a fixed encoding; 0 when there is no detector or it has no charset to
// report; otherwise the detector's own confidence.
-(float)confidence
{
	if(fixedencodingname!=nil) return 1;
	if(detector==nil||[detector MIMECharset]==nil) return 0;
	return [detector confidence];
}

-(UniversalDetector *)detector
{
	return detector;
}

// Sets the fixed encoding name, or clears it when passed nil.
-(void)setFixedEncodingName:(NSString *)encoding
{
	NSString *previous=fixedencodingname;
	fixedencodingname=[encoding retain];
	// Autoreleased rather than released, so setting the same object again is safe.
	[previous autorelease];
}

-(BOOL)hasFixedEncoding
{
	return fixedencodingname!=nil;
}

-(void)setPrefersMacEncodings:(BOOL)prefermac
{
	mac=prefermac;
}



#ifdef __APPLE__
// The current encoding name converted to an NSStringEncoding, or 0 when there
// is no name.
-(NSStringEncoding)encoding
{
	NSString *name=[self encodingName];
	if(name==nil) return 0;

	return [XADString encodingForEncodingName:name];
}

// Sets the fixed encoding from an NSStringEncoding; 0 clears it.
-(void)setFixedEncoding:(NSStringEncoding)encoding
{
	NSString *name=nil;
	if(encoding) name=[XADString encodingNameForEncoding:encoding];
	[self setFixedEncodingName:name];
}
#endif

@end
|
|
|
|
|
|
|
|
|
|
// Returns YES when no byte in data has its high bit set, i.e. every byte is
// in the 7-bit range. Empty or nil data counts as ASCII.
static BOOL IsDataASCII(NSData *data)
{
	const uint8_t *cursor=[data bytes];
	const uint8_t *end=cursor+[data length];

	while(cursor<end)
	{
		if(*cursor>=0x80) return NO;
		cursor++;
	}

	return YES;
}
|