rtoss

Subversion Repositories:
Compare Path: Rev
With Path: Rev
/GreenPad/ @ 132  →  /GreenPad/ @ 133
/GreenPad/OpenSaveDlg.cpp
@@ -25,6 +25,12 @@
TEXT("’†‘Œê(ISO-2022-CN)"),
TEXT("’†‘Œê(HZ)"),
TEXT("’†‘Œê(Big5)"),
TEXT("’†‘Œê(EUC-TW/CNS)"),
TEXT("’†‘Œê(TCA)"),
TEXT("’†‘Œê(ETen)"),
TEXT("’†‘Œê(IBM 5550)"),
TEXT("’†‘Œê(Teletext)"),
TEXT("’†‘Œê(Wang)"),
TEXT("UTF-1"),
TEXT("UTF-1(BOM)"),
TEXT("UTF-5"),
@@ -82,6 +88,12 @@
TEXT("Chinese(ISO-2022-CN)"),
TEXT("Chinese(HZ)"),
TEXT("Chinese(Big5)"),
TEXT("Chinese(EUC-TW/CNS)"),
TEXT("Chinese(TCA)"),
TEXT("Chinese(ETen)"),
TEXT("Chinese(IBM 5550)"),
TEXT("Chinese(Teletext)"),
TEXT("Chinese(Wang)"),
TEXT("UTF-1"),
TEXT("UTF-1(BOM)"),
TEXT("UTF-5"),
@@ -139,6 +151,12 @@
TEXT("I2CN"),
TEXT("HZ"),
TEXT("BIG5"),
TEXT("CNS"),
TEXT("TCA"),
TEXT("ETEN"),
TEXT("5550"),
TEXT("TLTX"),
TEXT("WANG"),
TEXT("UTF1"),
TEXT("UTF1"),
TEXT("UTF5"),
@@ -212,50 +230,56 @@
Enroll( IsoCN, 8 ),
Enroll( HZ , 9 );
if( ::IsValidCodePage(950) ) Enroll( Big5 , 10 );
/* if( always ) */ EnrollS( UTF1, 11 );
Enroll( UTF1Y, 12 );
Enroll( UTF5, 13 );
Enroll( UTF7, 14 );
Enroll( UTF8, 15 );
EnrollS( UTF8N, 16 );
EnrollS( UTF9, 17 );
Enroll( UTF9Y, 18 );
EnrollS( UTF16b, 19 );
EnrollS( UTF16l, 20 );
Enroll( UTF16BE, 21 );
Enroll( UTF16LE, 22 );
EnrollS( UTF32b, 23 );
EnrollS( UTF32l, 24 );
Enroll( UTF32BE, 25 );
Enroll( UTF32LE, 26 );
if( ::IsValidCodePage(850) ) Enroll( WesternDOS, 27 );
Enroll( Western, 28 );
if( ::IsValidCodePage(852) ) Enroll( CentralDOS, 29 );
if( ::IsValidCodePage(28592) ) Enroll( Central, 30 );
if( ::IsValidCodePage(855) ) Enroll( CyrillicIBM, 31 );
if( ::IsValidCodePage(866) ) Enroll( CyrillicDOS, 32 );
if( ::IsValidCodePage(28595) ) Enroll( Cyrillic, 33 );
if( ::IsValidCodePage(20866) ) Enroll( Koi8R, 34 );
if( ::IsValidCodePage(21866) ) Enroll( Koi8U, 35 );
if( ::IsValidCodePage(874) ) Enroll( Thai, 36 );
if( ::IsValidCodePage(857) ) Enroll( TurkishDOS, 37 );
if( ::IsValidCodePage(1254) ) Enroll( Turkish, 38 );
if( ::IsValidCodePage(775) ) Enroll( BalticIBM, 39 );
if( ::IsValidCodePage(1257) ) Enroll( Baltic, 40 );
if( ::IsValidCodePage(1258) ) Enroll( Vietnamese, 41 );
if( ::IsValidCodePage(737) ) Enroll( GreekIBM, 42 );
if( ::IsValidCodePage(869) ) Enroll( GreekMSDOS, 43 );
if( ::IsValidCodePage(28597) ) Enroll( Greek, 44 );
if( ::IsValidCodePage(720) ) Enroll( ArabicIBM, 45 );
if( ::IsValidCodePage(864) ) Enroll( ArabicMSDOS, 46 );
if( ::IsValidCodePage(1256) ) Enroll( Arabic, 47 );
if( ::IsValidCodePage(862) ) Enroll( HebrewDOS, 48 );
if( ::IsValidCodePage(1255) ) Enroll( Hebrew, 49 );
if( ::IsValidCodePage(860) ) Enroll( Portuguese, 50 );
if( ::IsValidCodePage(861) ) Enroll( Icelandic, 51 );
if( ::IsValidCodePage(863) ) Enroll( CanadianFrench, 52 );
if( ::IsValidCodePage(865) ) Enroll( Nordic, 53 );
Enroll( DOSUS, 54 );
if( ::IsValidCodePage(20000) ) Enroll( CNS , 11 );
if( ::IsValidCodePage(20001) ) Enroll( TCA , 12 );
if( ::IsValidCodePage(20002) ) Enroll( ETen , 13 );
if( ::IsValidCodePage(20003) ) Enroll( IBM5550, 14 );
if( ::IsValidCodePage(20004) ) Enroll( Teletext, 15 );
if( ::IsValidCodePage(20005) ) Enroll( Wang , 16 );
/* if( always ) */ EnrollS( UTF1, 17 );
Enroll( UTF1Y, 18 );
Enroll( UTF5, 19 );
Enroll( UTF7, 20 );
Enroll( UTF8, 21 );
EnrollS( UTF8N, 22 );
EnrollS( UTF9, 23 );
Enroll( UTF9Y, 24 );
EnrollS( UTF16b, 25 );
EnrollS( UTF16l, 26 );
Enroll( UTF16BE, 27 );
Enroll( UTF16LE, 28 );
EnrollS( UTF32b, 29 );
EnrollS( UTF32l, 30 );
Enroll( UTF32BE, 31 );
Enroll( UTF32LE, 32 );
if( ::IsValidCodePage(850) ) Enroll( WesternDOS, 33 );
Enroll( Western, 34 );
if( ::IsValidCodePage(852) ) Enroll( CentralDOS, 35 );
if( ::IsValidCodePage(28592) ) Enroll( Central, 36 );
if( ::IsValidCodePage(855) ) Enroll( CyrillicIBM, 37 );
if( ::IsValidCodePage(866) ) Enroll( CyrillicDOS, 38 );
if( ::IsValidCodePage(28595) ) Enroll( Cyrillic, 39 );
if( ::IsValidCodePage(20866) ) Enroll( Koi8R, 40 );
if( ::IsValidCodePage(21866) ) Enroll( Koi8U, 41 );
if( ::IsValidCodePage(874) ) Enroll( Thai, 42 );
if( ::IsValidCodePage(857) ) Enroll( TurkishDOS, 43 );
if( ::IsValidCodePage(1254) ) Enroll( Turkish, 44 );
if( ::IsValidCodePage(775) ) Enroll( BalticIBM, 45 );
if( ::IsValidCodePage(1257) ) Enroll( Baltic, 46 );
if( ::IsValidCodePage(1258) ) Enroll( Vietnamese, 47 );
if( ::IsValidCodePage(737) ) Enroll( GreekIBM, 48 );
if( ::IsValidCodePage(869) ) Enroll( GreekMSDOS, 49 );
if( ::IsValidCodePage(28597) ) Enroll( Greek, 50 );
if( ::IsValidCodePage(720) ) Enroll( ArabicIBM, 51 );
if( ::IsValidCodePage(864) ) Enroll( ArabicMSDOS, 52 );
if( ::IsValidCodePage(1256) ) Enroll( Arabic, 53 );
if( ::IsValidCodePage(862) ) Enroll( HebrewDOS, 54 );
if( ::IsValidCodePage(1255) ) Enroll( Hebrew, 55 );
if( ::IsValidCodePage(860) ) Enroll( Portuguese, 56 );
if( ::IsValidCodePage(861) ) Enroll( Icelandic, 57 );
if( ::IsValidCodePage(863) ) Enroll( CanadianFrench, 58 );
if( ::IsValidCodePage(865) ) Enroll( Nordic, 59 );
// DOSUS is registered right after Nordic (59), so it takes the next index, 60.
// The previous value 50 duplicated the index already given to Greek above.
Enroll( DOSUS, 60 );
 
// Done
#undef Enroll
/GreenPad/kilib/textfile.cpp
@@ -822,6 +822,130 @@
if( siz == u5sum )
return UTF5;
 
//-- UTF-16/32 detection
if( freq[ 0 ] ) // nulls in content?
{ // then it may be UTF-16/32 without BOM
# ifdef UTF_DEBUG
TCHAR utfTmp[80];
# endif
// Detect BOM-less UTF-16 LE: read the buffer as little-endian 16-bit units
// and score each unit. "confidence" grows for plausible units, "unconfidence"
// for units that could just as well be two ASCII bytes or a surrogate lead.
ulong x; ulong u2size=siz/2;
qbyte uchr;
ulong u16le_confidence = 0, u16le_unconfidence = 0;
int u16le_impossible = 0;
for( x=0; x < u2size; x++ )
{
	uchr = ptr[x*2] | ptr[x*2+1]<<8;
	if( IsNonUnicodeRange(uchr) || uchr==0 ) // \0\0 may be a part of UTF-32
	{
		// One invalid unit rules this encoding out entirely.
		u16le_impossible = 1;
		break;
	}
	if((0x00 <= uchr && uchr < 0x80)) u16le_confidence+=2; // unicode ASCII
	else if(IsAscii(ptr[x*2]) && IsAscii(ptr[x*2+1])) // both bytes are ASCII
	{
		// Ambiguous: counts for and against in equal measure.
		++u16le_confidence;
		++u16le_unconfidence;
	}
	else if(IsSurrogateLead(uchr)) ++u16le_unconfidence; // surrogate pairs are less-used
	else ++u16le_confidence; // other Unicode chars
}
if( !u16le_impossible )
{
# ifdef UTF_DEBUG
	::wsprintf(utfTmp,TEXT("usize=%d, confidence=%d, unconfidence=%d"),u2size,u16le_confidence,u16le_unconfidence);
	::MessageBox(NULL,utfTmp,TEXT("UTF16LEDetect"),0);
# endif
	// Written as an addition on the right-hand side instead of
	// (confidence-unconfidence) > size: with unsigned counters the
	// subtraction wraps to a huge value whenever unconfidence exceeds
	// confidence, which would wrongly accept the buffer as UTF-16 LE.
	if( u16le_confidence > u16le_unconfidence + u2size ) return UTF16LE;
}
 
// Detect BOM-less UTF-16 BE: same scoring as the little-endian pass, but the
// first byte of each pair is the high byte.
ulong u16be_confidence = 0, u16be_unconfidence = 0;
int u16be_impossible = 0;
for( x=0; x < u2size; x++ )
{
	uchr = ptr[x*2+1] | ptr[x*2]<<8;
	if( IsNonUnicodeRange(uchr) || uchr==0 ) // \0\0 may be a part of UTF-32
	{
		// One invalid unit rules this encoding out entirely.
		u16be_impossible = 1;
		break;
	}
	if((0x00 <= uchr && uchr < 0x80)) u16be_confidence+=2; // unicode ASCII
	else if(IsAscii(ptr[x*2]) && IsAscii(ptr[x*2+1])) // both bytes are ASCII
	{
		// Ambiguous: counts for and against in equal measure.
		++u16be_confidence;
		++u16be_unconfidence;
	}
	else if(IsSurrogateLead(uchr)) ++u16be_unconfidence; // surrogate pairs are less-used
	else ++u16be_confidence; // other Unicode chars
}
if( !u16be_impossible )
{
# ifdef UTF_DEBUG
	::wsprintf(utfTmp,TEXT("usize=%d, confidence=%d, unconfidence=%d"),u2size,u16be_confidence,u16be_unconfidence);
	::MessageBox(NULL,utfTmp,TEXT("UTF16BEDetect"),0);
# endif
	// Written as an addition on the right-hand side instead of
	// (confidence-unconfidence) > size: with unsigned counters the
	// subtraction wraps to a huge value whenever unconfidence exceeds
	// confidence, which would wrongly accept the buffer as UTF-16 BE.
	if( u16be_confidence > u16be_unconfidence + u2size ) return UTF16BE;
}
 
// Detect BOM-less UTF-32 LE: read the buffer as little-endian 32-bit units
// and score each unit. Any unit outside the accepted Unicode ranges rules the
// encoding out.
ulong u4size=siz/4;
ulong u32le_confidence = 0, u32le_unconfidence = 0;
int u32le_impossible = 0;
for( x=0; x < u4size; x++ )
{
	// Widen each byte to qbyte before shifting so that <<24 never shifts a
	// promoted int into its sign bit.
	uchr = (qbyte)ptr[x*4] | (qbyte)ptr[x*4+1]<<8 | (qbyte)ptr[x*4+2]<<16 | (qbyte)ptr[x*4+3]<<24;
	if( IsNonUnicodeRange(uchr) )
	{
		u32le_impossible = 1;
		break;
	}
	// ASCII code points weigh 2, matching the UTF-16 passes. The former
	// "++counter+=2" modified the counter twice in one expression
	// (undefined before C++11, +3 afterwards).
	if((0x00 <= uchr && uchr < 0x80)) u32le_confidence+=2; // unicode ASCII
	else ++u32le_confidence; // other Unicode chars
	if(ptr[x*4] == 0)
	{
		// A zero lowest byte counts against UTF-32 LE, more so when the
		// next byte is above 0x10.
		++u32le_unconfidence;
		if(ptr[x*4+1] > 0x10) u32le_unconfidence+=2;
	}
}
if( !u32le_impossible )
{
# ifdef UTF_DEBUG
	::wsprintf(utfTmp,TEXT("usize=%d, confidence=%d, unconfidence=%d"),u4size,u32le_confidence,u32le_unconfidence);
	::MessageBox(NULL,utfTmp,TEXT("UTF32LEDetect"),0);
# endif
	// Written as an addition on the right-hand side instead of
	// confidence-unconfidence > size: with unsigned counters the
	// subtraction wraps to a huge value whenever unconfidence exceeds
	// confidence, which would wrongly accept the buffer as UTF-32 LE.
	if( u32le_confidence > u32le_unconfidence + u4size ) return UTF32LE;
}
 
// Detect BOM-less UTF-32 BE: same scoring as the little-endian pass, but the
// first byte of each group of four is the highest byte.
ulong u32be_confidence = 0, u32be_unconfidence = 0;
int u32be_impossible = 0;
for( x=0; x < u4size; x++ )
{
	// Widen each byte to qbyte before shifting so that <<24 never shifts a
	// promoted int into its sign bit.
	uchr = (qbyte)ptr[x*4+3] | (qbyte)ptr[x*4+2]<<8 | (qbyte)ptr[x*4+1]<<16 | (qbyte)ptr[x*4]<<24;
	if( IsNonUnicodeRange(uchr) )
	{
		u32be_impossible = 1;
		break;
	}
	// ASCII code points weigh 2, matching the UTF-16 passes. The former
	// "++counter+=2" modified the counter twice in one expression
	// (undefined before C++11, +3 afterwards).
	if((0x00 <= uchr && uchr < 0x80)) u32be_confidence+=2; // unicode ASCII
	else ++u32be_confidence; // other Unicode chars
	if(ptr[x*4+3] == 0)
	{
		// A zero lowest byte counts against UTF-32 BE, more so when the
		// next byte is above 0x10.
		++u32be_unconfidence;
		if(ptr[x*4+2] > 0x10) u32be_unconfidence+=2;
	}
}
if( !u32be_impossible )
{
# ifdef UTF_DEBUG
	::wsprintf(utfTmp,TEXT("usize=%d, confidence=%d, unconfidence=%d"),u4size,u32be_confidence,u32be_unconfidence);
	::MessageBox(NULL,utfTmp,TEXT("UTF32BEDetect"),0);
# endif
	// Written as an addition on the right-hand side instead of
	// confidence-unconfidence > size: with unsigned counters the
	// subtraction wraps to a huge value whenever unconfidence exceeds
	// confidence, which would wrongly accept the buffer as UTF-32 BE.
	if( u32be_confidence > u32be_unconfidence + u4size ) return UTF32BE;
}
}
 
//-- chardet and MLang detection
if( app().isNewShell() )
{ // chardet works better when size > 64
@@ -990,6 +1114,7 @@
STR2CP("Shift_JIS",SJIS)
STR2CP("EUC-JP",EucJP)
STR2CP("EUC-KR",UHC)
STR2CP("x-euc-tw",CNS)
STR2CP("Big5",Big5)
STR2CP("gb18030",GBK)
STR2CP("UTF-8",UTF8)
@@ -1011,6 +1136,52 @@
return cs;
}
 
// functions for detecting BOM-less UTF-16/32
// Returns true when code point u lies in one of the gaps listed below (ranges
// with no block assigned in the Unicode 5.2 layout) or is 0x110000 or above.
// Returns false for every other value.
//
// Every range is half-open: [first unassigned, first code point of the next
// assigned block). The trailing comment on each line names the assigned
// block(s) that follow the gap. Lines that are commented out are blocks that
// are assigned and therefore deliberately not treated as gaps.
bool TextFileR::IsNonUnicodeRange(qbyte u)
{	// Unicode 5.2 based
	return (0x000840 <= u && u < 0x000900) ||
		//(0x0018B0 <= u && u < 0x001900) || // U+18B0-18FF : Unified Canadian Aboriginal Syllabics Extended
		(0x001AB0 <= u && u < 0x001B00) ||
		(0x001BC0 <= u && u < 0x001C00) ||
		(0x001C80 <= u && u < 0x001CD0) ||
		//(0x002C60 <= u && u < 0x002C80) || // U+2C60-2C7F : Latin Extended-C
		//(0x002DE0 <= u && u < 0x002E00) || // U+2DE0-2DFF : Cyrillic Extended-A
		(0x002FE0 <= u && u < 0x002FF0) ||
		//(0x00A4D0 <= u && u < 0x00A700) || // U+A4D0-A4FF,A500-A63F,A640-A69F,A6A0-A6FF : Lisu, Vai, Cyrillic Extended-B, Bamum
		//(0x00A720 <= u && u < 0x00A800) || // U+A720-A7FF : Latin Extended-D
		(0x00A9E0 <= u && u < 0x00AA00) ||
		(0x00AAE0 <= u && u < 0x00ABC0) ||
		//(0x00D7B0 <= u && u < 0x00D800) || // U+D7B0-D7FF : Hangul Jamo Extended-B
		(0x010200 <= u && u < 0x010280) || // next: U+10280-1029F,102A0-102DF : Lycian, Carian
		(0x0102E0 <= u && u < 0x010300) || // next: U+10300-1032F,10330-1034F : Old Italic, Gothic
		(0x010350 <= u && u < 0x010380) || // next: U+10380-1039F,103A0-103DF : Ugaritic, Old Persian
		(0x0103E0 <= u && u < 0x010400) || // next: U+10400-1044F,10450-1047F,10480-104AF : Deseret, Shavian, Osmanya
		(0x0104B0 <= u && u < 0x010800) || // next: U+10800-1083F,10840-1085F : Cypriot Syllabary, Imperial Aramaic
		(0x010860 <= u && u < 0x010900) || // next: U+10900-1091F,10920-1093F : Phoenician, Lydian
		(0x010940 <= u && u < 0x010A00) || // next: U+10A00-10A5F,10A60-10A7F : Kharoshthi, Old South Arabian
		(0x010A80 <= u && u < 0x010B00) || // next: U+10B00-10B3F,10B40-10B5F,10B60-10B7F : Avestan, Inscriptional Parthian, Inscriptional Pahlavi
		(0x010B80 <= u && u < 0x010C00) || // next: U+10C00-10C4F : Old Turkic
		(0x010C50 <= u && u < 0x010E60) || // next: U+10E60-10E7F : Rumi Numeral Symbols
		(0x010E80 <= u && u < 0x011080) || // next: U+11080-110CF : Kaithi
		(0x0110D0 <= u && u < 0x012000) || // next: U+12000-123FF,12400-1247F : Cuneiform, Cuneiform Numbers and Punctuation
		(0x012480 <= u && u < 0x013000) || // next: U+13000-1342F : Egyptian Hieroglyphs
		(0x013430 <= u && u < 0x01D000) || // next: U+1D000-1D0FF,1D100-1D1FF,1D200-1D24F : Byzantine Musical Symbols, Musical Symbols, Ancient Greek Musical Notation
		(0x01D250 <= u && u < 0x01D300) || // next: U+1D300-1D35F,1D360-1D37F : Tai Xuan Jing Symbols, Counting Rod Numerals
		(0x01D380 <= u && u < 0x01D400) || // next: U+1D400-1D7FF : Mathematical Alphanumeric Symbols
		(0x01D800 <= u && u < 0x01F000) || // next: U+1F000-1F02F,1F030-1F09F : Mahjong Tiles, Domino Tiles
		(0x01F0A0 <= u && u < 0x01F100) || // next: U+1F100-1F1FF,1F200-1F2FF : Enclosed Alphanumeric Supplement, Enclosed Ideographic Supplement
		(0x01F300 <= u && u < 0x020000) || // next: U+20000-2A6DF : CJK Unified Ideographs Extension B
		(0x02A6E0 <= u && u < 0x02A700) || // next: U+2A700-2B73F : CJK Unified Ideographs Extension C
		(0x02B740 <= u && u < 0x02F800) || // next: U+2F800-2FA1F : CJK Compatibility Ideographs Supplement
		(0x02FA20 <= u && u < 0x0E0000) || // next: U+E0000-E007F : Tags
		(0x0E0080 <= u && u < 0x0E0100) || // next: U+E0100-E01EF : Variation Selectors Supplement
		(0x0E01F0 <= u && u < 0x0F0000) || // next: U+F0000-FFFFD : Supplementary Private Use Area A
		(0x110000 <= u);                   // U+100000-10FFFD is Supplementary Private Use Area B; U+110000 and above is never used
}
// Returns true for byte values 0x20 through 0x7F inclusive; values below 0x20
// and values above 0x7F are rejected.
bool TextFileR::IsAscii(uchar c)
{
	return c >= 0x20 && c <= 0x7F;
}
// Returns true when w lies in 0xD800..0xDBFF, the range of UTF-16 lead
// (high) surrogates.
bool TextFileR::IsSurrogateLead(qbyte w)
{
	return w >= 0xD800 && w < 0xDC00;
}
 
 
//=========================================================================
// Common interface for text file output
//=========================================================================
/GreenPad/kilib/textfile.h
@@ -65,6 +65,12 @@
IsoCN = -936, // Chinese 2 (Simplified, ISO-2022-CN)
HZ = -937, // Chinese 3 (Simplified, HZ-GB2312)
Big5 = 950, // Chinese 4 (Traditional, Big5)
CNS = 20000,// Chinese 5 (Traditional, EUC-TW/CNS)
TCA = 20001,// Chinese 6 (Traditional, TCA)
ETen = 20002,// Chinese 7 (Traditional, ETen)
IBM5550 = 20003,// Chinese 8 (Traditional, IBM5550)
Teletext = 20004,// Chinese 9 (Traditional, Teletext)
Wang = 20005,// Chinese 10 (Traditional, Wang)
 
SJIS = 932, // Japanese 1 (Shift_JIS)
EucJP = -932, // Japanese 2 (Japanese EUC)
@@ -169,6 +175,10 @@
int MLangAutoDetection( const uchar* ptr, ulong siz );
int chardetAutoDetection( const uchar* ptr, ulong siz );
 
// Helpers for detecting BOM-less UTF-16/32 text.
// Declared without a TextFileR:: prefix: a qualified name on a declaration
// inside its own class is extra qualification, which standard-conforming
// compilers reject.
bool IsNonUnicodeRange(qbyte u); // true if u is in a range treated as not valid Unicode
bool IsAscii(uchar c);           // true if c is in 0x20..0x7F
bool IsSurrogateLead(qbyte w);   // true if w is in 0xD800..0xDBFF
 
private:
 
NOCOPY(TextFileR);