function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding; var len: Longint; buf: string; //array[1..4096] of Char;
function Maybe3BytesUtf8(Index: Integer): Boolean; begin Result := (Index + 2 <= len) and (buf[Index] in [#$E0..#$EF]) and (buf[Index +1] in [#$80..#$BF]) and (buf[Index +2] in [#$80..#$BF]); end;
var idx: Longint; iUtf8: Longint; maybeGB: Integer; //GB2312/GBK/GB18030 mayBig5: Integer; //Big5 mayUtf8: Integer; //Utf-8 maybeLE: Integer; //Unicode 16 (UCS2) , Little Endian maybeBE: Integer; //Unicode 16 (UCS2) , Big Endian ratio: Integer; begin Result := ceAnsi; maybeGB := 0; mayBig5 := 0; mayUtf8 := 0; maybeLE := 0; maybeBE := 0; SetLength(buf, SamplingSize); len := AStream.Read(buf[1], Length(buf)); idx := 1; while idx <= len do begin case buf[idx] of #0: begin if (idx mod 2) = 0 then begin Inc(maybeLE); end else begin Inc(maybeBE); end; end; #$80: begin iUtf8 := idx; Inc(iUtf8); if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8); if Maybe3BytesUtf8(iUtf8) then begin Inc(mayUtf8, 32); end; end; #$81..#$BF: begin if buf[idx] in [#$81..#$A0] then begin Inc(maybeGB, 8); end; Inc(maybeGB, 8); Inc(mayBig5, 8); Inc(idx); iUtf8 := idx; if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8); if Maybe3BytesUtf8(iUtf8) then begin Inc(mayUtf8, 32); end; end; #$C0..#$DF: begin if (idx < len) and (buf[idx +1] in [#$80..#$BF]) then begin Inc(mayUtf8); if (buf[idx +1] in [#$A1..#$BF]) then begin Inc(maybeGB); Inc(mayBig5); end else begin Inc(maybeGB, 4); end; end; Inc(idx); end; #$E0..#$EF: begin if (idx + 2 <= len) and (buf[idx +1] in [#$80..#$BF]) and (buf[idx +2] in [#$80..#$BF]) then begin Inc(mayUtf8, 32); end; Inc(idx); end; #$F0..#$FE: begin if buf[idx] in [#$FA..#$FE] then begin Inc(maybeGB, 8); end; Inc(maybeGB, 8); Inc(mayBig5, 8); Inc(idx); end; end; Inc(idx); end; // set encoding if (maybeLE > 0) or (maybeBE > 0) then begin if maybeLE >= maybeBE then begin Result := ceUcs2_LE; end else begin Result := ceUcs2_BE; end; end else if (maybeGB >= mayUtf8) or (mayBig5 >= mayUtf8) then begin ratio := (maybeGB - mayBig5) * 100 div Max(1, Max(maybeGB, mayBig5)); if ratio <= 5 then begin if SeeGBBig5 then begin Result := TryToDistinguishGBOrBig5(Copy(buf, 1, len)); end; end else begin if ratio > 0 then begin Result := ceGB; end else begin Result := ceBig5; end; end; end else if mayUtf8 > 0 then begin Result := ceUtf_8; end; end;
请仔细看这段代码,并将它与下面关于编码的说明对比: {---------- Character Encoding ----------
function MyCompareChineseStr(const s1, s2: string): Boolean; var difCount: Integer; i, k: Integer; cmpLen: Integer; begin difCount := 0; i := 1; k := 1; while (i <= Length(s1)) and (k <= Length(s2)) do begin if s1[i] <> s2[k] then begin if (i +2 <= Length(s1)) and (k +2 <= Length(s2)) then begin //比较原理:两次转换后,某些字可能会转戌一个'?',以下处理这种情况 if (s1[i +1] = s2[k]) and (s1[i +2] = s2[k +1]) then begin Inc(i); end else if (s1[i] = s2[k +1]) and (s1[i +1] = s2[k +2]) then begin Inc(k); end else begin Inc(difCount); end; end else begin Inc(difCount); end; end; Inc(i); Inc(k); end; if ExactCompare then begin cmpLen := CountChineseChars(S); end else begin cmpLen := Length(S); end; Result := difCount * 100 div Max(1, cmpLen) <= 6; // different <= 6% end;
begin Result := ceAnsi; if SysLocale.PriLangID = LANG_CHINESE then begin case SysLocale.SubLangID of SUBLANG_CHINESE_SIMPLIFIED, SUBLANG_CHINESE_SINGAPORE: begin if not MyCompareChineseStr(S, Big52GBProc(GB2Big5Proc(S))) then begin Result := ceBig5; end; end; SUBLANG_CHINESE_TRADITIONAL, SUBLANG_CHINESE_HONGKONG: begin if not MyCompareChineseStr(S, GB2Big5Proc(Big52GBProc(S))) then begin Result := ceGB; end; end; end; end; end;
Big52GBProc/ GB2Big5Proc是两个函数指针,分别用于繁体转简体和简体转繁体。 ExactCompare是一个单元内的全局变量,用于适应不同简繁体转换函数。 CountChineseChars用于统计中文字符个数,具体实现如下: { Count Chinese Characters } function CountChineseChars(const S: string): Integer; var i: Integer; begin Result := 0; i := 1; while i <= Length(S) do begin if S[i] > #$80 then begin Inc(Result, 2); Inc(i, 2); end else begin Inc(i); end; end; end;