盒子 - 文本文件的智能识别方法

			捐赠 \| 广告 \| 注册 \| 发布 \| 上传 \| 关于我们
		粤ICP备10103342号-1	DELPHI盒子 \| 盒子文章 \| 盒子问答悬赏 \| 最新更新 \| 盒子检索 \| 下载中心 \| 高级搜索
		粤ICP备10103342号-1	精品专区 \| 繁體中文 \| 奖励公告栏 \| 直通车账号登陆 \| 关闭GOOGLE广告 \| 临时留言

盒子资源分类

全部展开 - 全部合拢

盒子资源分类大树

文本文件的智能识别方法

关键字：文本智能识别 GB Big5
来　自：原创
平　台：Win2k/XP/NT,Win2003	下载所需：0 火柴
深浅度：中级	完成时间：2012/3/13
发布者：iamdream (奖励50火柴)	发布时间：2012/3/13
编辑器：DELPHI5	语　　种：简体中文
分　类：应用软件	下载浏览：459/13377

加入到我的收藏

下载错误报错

登陆以后才能下载

用户名：

密　码：

自动登陆(30天有效)

    近日整理自己写的文本编辑器时，觉得不能自动识别文本编码用起来很不方便，于是研究了一下文本文件的编码方式，发现如果文件中有BOM(Byte Order Mark)头，那识别起来简单；如果没有BOM头，就没那么好处理了。
    用Google搜索了老半天，自己也苦思冥想，发现大致可以划分为以下几种情况(这里只考虑中英文)：
    1、有BOM头的文件
    这个处理起来比较简单，只要识别出各自的编码方式，然后加以转换就可以了。
    2、没有BOM头的UCS2(UTF-16)文件
    这种类型的文件如果其中含有Ascii码字符，那也可以用比较简单的方式加以判断：看看文件中是否有为0的字符，如果有，基本上可以认为是UCS2了。当然，文件中有错，或没有Ascii码字符，那就没法判断了，因为UCS2(UTF-16)用的编码是0x0000-0xFFFF，所以难以从字符特征上来判断。
    3、没有BOM头的UTF-8编码的文件
    由于UTF-8编码有一定的特征，如下面的说明：
UTF-8
  Ascii Chars:
  00-7F          // 1 Bytes = 0xxxxxxx
  Multi Bytes:
  C0-DF + 80-BF          // 2 Bytes = 110xxxxx 10xxxxxx
  E0-EF + 80-BF + 80-BF          // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx
  F0-F7 + 80-BF + 80-BF + 80-BF  // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    所以可以从编码特征上来考虑。
    4、没有BOM头的ANSI文件
    此类文件里可能有Ascii字符，也可能有GB2312/GBK/GB18030/Big5的中文字符。单纯从文件上来看，它与没有BOM头的UTF-8编码文件类似，所以必须考虑如何与UTF-8文件区分开来。
    此外，还有不常用的UCS4(UTF-32)，这里就不考虑了。GB18030是与GBK兼容的，这里也只考虑它的2字节部分，4字节部分较少用到，不作考虑。

    通过以上分析，可以看出，现在的主要问题是如何区分没有BOM头的UTF-8文件和ANSI文件。进一步，还有如何判断ANSI文件是简体还是繁体的问题。
    以下代码是我想出来的一个猜测文本文件编码的方法，这可是原创哦(代码是Delphi的)：
type
  { Text encodings reported by the detection routines in this unit.
    ceAnsi is the default result the routines start from; ceGB and ceBig5
    are the simplified / traditional Chinese refinements of an ANSI guess.
    NOTE(review): ceUtf_32 is declared but no code visible here returns it. }
  TCharEncoding = (ceAnsi, ceUtf_8, ceUcs2_LE, ceUcs2_BE, ceUtf_32, ceGB, ceBig5);

{ Guesses the character encoding of a BOM-less text stream.

  Reads up to SamplingSize bytes from AStream (the stream position is left
  after the sample), scans them once and accumulates weighted scores for
  UTF-8, GB (GB2312/GBK/2-byte GB18030), Big5 and UCS-2 LE/BE.

  Parameters:
    AStream   - stream positioned at the start of the text to sample.
    SeeGBBig5 - when True and the GB and Big5 scores are too close to call,
                TryToDistinguishGBOrBig5 is asked to decide.

  Returns:
    ceUcs2_LE / ceUcs2_BE when any zero byte was seen,
    ceGB / ceBig5 / ceAnsi when the ANSI scores are not below the UTF-8 score,
    ceUtf_8 when the UTF-8 score dominates,
    ceAnsi otherwise (including an empty sampling buffer).

  NOTE(review): buf is used as a raw byte buffer, which presumes a
  single-byte string type (the article targets Delphi 5) - confirm before
  compiling with a Unicode-string compiler. }
function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding;
var
  len: Longint;   // number of sample bytes actually read
  buf: string;    // sample bytes, 1-based

  { True when buf[Index..Index+2] lies inside the sample and has the shape
    of a 3-byte UTF-8 sequence: E0..EF, 80..BF, 80..BF. }
  function Maybe3BytesUtf8(Index: Integer): Boolean;
  begin
    Result := (Index + 2 <= len) and (buf[Index] in [#$E0..#$EF]) and
      (buf[Index + 1] in [#$80..#$BF]) and (buf[Index + 2] in [#$80..#$BF]);
  end;

var
  idx: Longint;
  iUtf8: Longint;
  maybeGB: Integer;   // score for GB2312/GBK/GB18030
  mayBig5: Integer;   // score for Big5
  mayUtf8: Integer;   // score for UTF-8
  maybeLE: Integer;   // zero bytes suggesting UCS-2 little endian
  maybeBE: Integer;   // zero bytes suggesting UCS-2 big endian
  ratio:   Integer;   // signed GB-vs-Big5 lead, in percent of the larger score
begin
  Result := ceAnsi;
  maybeGB := 0;
  mayBig5 := 0;
  mayUtf8 := 0;
  maybeLE := 0;
  maybeBE := 0;
  SetLength(buf, SamplingSize);
  // Nothing to sample into: buf[1] below would index an empty string.
  if Length(buf) = 0 then Exit;
  len := AStream.Read(buf[1], Length(buf));
  idx := 1;
  while idx <= len do begin
    case buf[idx] of
      #0: begin
        // idx is 1-based: a zero at an even position is the second byte of
        // a 16-bit unit (little endian), at an odd position the first byte
        // (big endian).
        if (idx mod 2) = 0 then begin
          Inc(maybeLE);
        end else begin
          Inc(maybeBE);
        end;
      end;
      #$80: begin
        // Continuation-shaped byte: skip at most one further continuation
        // byte, then see whether a 3-byte UTF-8 sequence follows.
        iUtf8 := idx + 1;
        if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
        if Maybe3BytesUtf8(iUtf8) then begin
          Inc(mayUtf8, 32);
        end;
      end;
      #$81..#$BF: begin
        // Possible double-byte lead. 81..A0 gets a double GB weight because
        // only GB is credited extra for that range.
        if buf[idx] in [#$81..#$A0] then begin
          Inc(maybeGB, 8);
        end;
        Inc(maybeGB, 8);
        Inc(mayBig5, 8);
        Inc(idx);   // consume the presumed trail byte
        iUtf8 := idx;
        if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
        if Maybe3BytesUtf8(iUtf8) then begin
          Inc(mayUtf8, 32);
        end;
      end;
      #$C0..#$DF: begin
        // Shape of a 2-byte UTF-8 lead, which is also a valid GB/Big5 lead.
        if (idx < len) and (buf[idx + 1] in [#$80..#$BF]) then begin
          Inc(mayUtf8);
          if buf[idx + 1] in [#$A1..#$BF] then begin
            Inc(maybeGB);
            Inc(mayBig5);
          end else begin
            // Trail byte 80..A0: credited to GB only.
            Inc(maybeGB, 4);
          end;
        end;
        Inc(idx);
      end;
      #$E0..#$EF: begin
        // Shape of a 3-byte UTF-8 lead. Only one extra byte is consumed, so
        // the last continuation byte is still scored by the branches above.
        if Maybe3BytesUtf8(idx) then begin
          Inc(mayUtf8, 32);
        end;
        Inc(idx);
      end;
      #$F0..#$FE: begin
        // FA..FE gets a double GB weight.
        if buf[idx] in [#$FA..#$FE] then begin
          Inc(maybeGB, 8);
        end;
        Inc(maybeGB, 8);
        Inc(mayBig5, 8);
        Inc(idx);
      end;
    end;
    Inc(idx);
  end;

  // Pick the encoding from the accumulated scores.
  if (maybeLE > 0) or (maybeBE > 0) then begin
    if maybeLE >= maybeBE then begin
      Result := ceUcs2_LE;
    end else begin
      Result := ceUcs2_BE;
    end;
  end else if (maybeGB >= mayUtf8) or (mayBig5 >= mayUtf8) then begin
    ratio := (maybeGB - mayBig5) * 100 div Max(1, Max(maybeGB, mayBig5));
    // "Too close to call" must look at the magnitude of the lead. The
    // previous test (ratio <= 5) also captured every negative ratio, which
    // made the ceBig5 arm below unreachable. With the weights used above
    // maybeGB never falls below mayBig5, so results are unchanged today;
    // the branch simply becomes correct if the weights are ever retuned.
    if Abs(ratio) <= 5 then begin
      if SeeGBBig5 then begin
        Result := TryToDistinguishGBOrBig5(Copy(buf, 1, len));
      end;
    end else if ratio > 0 then begin
      Result := ceGB;
    end else begin
      Result := ceBig5;
    end;
  end else if mayUtf8 > 0 then begin
    Result := ceUtf_8;
  end;
end;

    请仔细看这段代码，并将它与下面关于编码的说明对比：
{---------- Character Encoding ----------

1. UTF-8
  Ascii Chars:  00-7F          // 1 Bytes = 0xxxxxxx
  Multi Bytes:  C0-DF + 80-BF          // 2 Bytes = 110xxxxx 10xxxxxx
          E0-EF + 80-BF + 80-BF          // 3 Bytes = 1110xxxx 10xxxxxx 10xxxxxx
          F0-F7 + 80-BF + 80-BF + 80-BF  // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

2. Unicode 16 (UCS2) [as UTF-16 for 96.9%]
  2 Bytes Characters:        0000-FFFF

3. UTF-16
          0000-D7FF     =  0000-D7FF          // 2 Bytes
          E000-FFFF     =  E000-FFFF          // 2 Bytes
          10000-10FFFF  =  D800-DBFF + DC00-DFFF  // 4 Bytes
          D800-DFFF          // Surrogate

4. UTF-32  = 32 bit unsigned integer of character

5. GB2312-80
  Ascii Chars:          00-7F
  Simplified Chinese Chars:  A1-F7 + A1-FE

6. Big5
  Ascii Chars:          00-7F
  Traditional Chinese Chars: A1-F9 + 40-7E
          A1-F9 + A1-FE

7. GBK
  Ascii Chars:          00-7F
  Chinese Chars:          81-FE + 40-7E
          81-FE + 80-FE

8. GB18030-2000
  Ascii Chars:          00-7F
  Chinese Chars (2 Bytes):   81-FE + 40-7E
          81-FE + 80-FE
  Chinese Chars (4 Bytes):   81-FE + 30-39 + 81-FE + 30-39

----------}

    看出来了吗？这个函数利用UTF-8的编码特征和GB/Big5的编码范围，再加上权值，来判断是UTF-8还是ANSI文档(UCS2(UTF-16)的判断只是附带的，可能也不是很准确)。
    以我的测试来看，用这种方法可以比较准确地区分UTF-8和ANSI编码的文件。
    等等，还有一个TryToDistinguishGBOrBig5函数的调用，相比较而言，这个函数的实现才是一个高潮部分，哈。
    如果你在网上搜索，如何判断文档是GB还是Big5，估计你得到的最靠谱的答案是：利用常用词库。我也曾想过用这种方法，结果一查，好家伙，常用词库少说也得好几万个，如果加上现在层出不穷的网络词语，估计光是词库就够忙一阵的了。
    不知道在哪个Action Script的论坛上，我看到一个建议，说是先将文本尝试着转换一下，但也就一句话，具体怎么做还是没答案。我尝试了一些方法之后，忽然想到：在简体中文系统下，是否可以将文本先转换成繁体，然后再转回成简体，再分析转换前后有什么不同，如果不同的地方超过一定比例，则认为是繁体，否则就是简体？立即动手写了一些代码，就是上述的TryToDistinguishGBOrBig5，试了一下，哈，还真的行啊。
{ Decides whether the double-byte text S is GB or Big5 by converting it to
  the other script and back, then measuring how much of it survived.

  On a simplified-Chinese system locale S is converted GB -> Big5 -> GB; if
  the round trip differs from S too much, S is reported as ceBig5. On a
  traditional-Chinese locale the mirror conversion is used and a mismatch
  yields ceGB. In every other case (non-Chinese locale, unlisted sub-language,
  or a close round trip) the result is ceAnsi. }
function TryToDistinguishGBOrBig5(const S: string): TCharEncoding;

  { Returns True when s2 differs from s1 in at most 6% of the compared
    length. The length basis is CountChineseChars(S) when ExactCompare is
    set, otherwise Length(S), where S is the outer function's argument. }
  function MyCompareChineseStr(const s1, s2: string): Boolean;
  var
    len1, len2: Integer;
    p1, p2: Integer;
    mismatches: Integer;
    basis: Integer;
    resynced: Boolean;
  begin
    len1 := Length(s1);
    len2 := Length(s2);
    mismatches := 0;
    p1 := 1;
    p2 := 1;
    while (p1 <= len1) and (p2 <= len2) do begin
      if s1[p1] <> s2[p2] then begin
        // After the two conversions some characters may have collapsed into
        // a single '?'. When two bytes of look-ahead are available on both
        // sides, try to realign by skipping one byte in either string.
        resynced := False;
        if (p1 + 2 <= len1) and (p2 + 2 <= len2) then begin
          if (s1[p1 + 1] = s2[p2]) and (s1[p1 + 2] = s2[p2 + 1]) then begin
            Inc(p1);
            resynced := True;
          end else if (s1[p1] = s2[p2 + 1]) and (s1[p1 + 1] = s2[p2 + 2]) then begin
            Inc(p2);
            resynced := True;
          end;
        end;
        if not resynced then
          Inc(mismatches);
      end;
      Inc(p1);
      Inc(p2);
    end;

    if ExactCompare then
      basis := CountChineseChars(S)
    else
      basis := Length(S);

    // Accept up to 6% differing bytes.
    Result := (mismatches * 100) div Max(1, basis) <= 6;
  end;

var
  roundTrip: string;
begin
  Result := ceAnsi;
  if SysLocale.PriLangID <> LANG_CHINESE then
    Exit;

  case SysLocale.SubLangID of
    SUBLANG_CHINESE_SIMPLIFIED,
    SUBLANG_CHINESE_SINGAPORE:
      begin
        // Treat S as GB: to Big5 and back again.
        roundTrip := Big52GBProc(GB2Big5Proc(S));
        if not MyCompareChineseStr(S, roundTrip) then
          Result := ceBig5;
      end;
    SUBLANG_CHINESE_TRADITIONAL,
    SUBLANG_CHINESE_HONGKONG:
      begin
        // Treat S as Big5: to GB and back again.
        roundTrip := GB2Big5Proc(Big52GBProc(S));
        if not MyCompareChineseStr(S, roundTrip) then
          Result := ceGB;
      end;
  end;
end;


    Big52GBProc/GB2Big5Proc是两个函数指针，分别用于繁体转简体和简体转繁体。
    ExactCompare是一个单元内的全局变量，用于适应不同简繁体转换函数。
    CountChineseChars用于统计中文字符个数，具体实现如下：
{ Returns the number of bytes in S that belong to double-byte characters.
  Any byte greater than $80 is taken as a lead byte: it adds 2 to the result
  and the byte after it is skipped without being inspected. A lead byte in
  the last position therefore still counts as 2. }
function CountChineseChars(const S: string): Integer;
var
  pos, last, step: Integer;
begin
  Result := 0;
  last := Length(S);
  pos := 1;
  while pos <= last do begin
    step := 1;
    if Ord(S[pos]) > $80 then begin
      // Lead byte: count the pair and jump over the trail byte.
      step := 2;
      Inc(Result, step);
    end;
    Inc(pos, step);
  end;
end;

    可能是史上最强的判断GB/Big5编码的方法诞生了！也许这个方法早就有人发现了，只是没公布出来。用这个方法，我写了个简单的文本测试工具，将我手头能找到的GB/Big5文件找出来试了，还没发现不能识别的呢。
    当然，这个方法只是取文件开头的一部分(默认是4K)来作样本，所以并不能识别所有文件，不过，对通常的应用足够了。
    完整及最新修改后的代码请参考我的博客：
http://dreamisx.blog.163.com/blog/static/11500483920122134947776/
http://dreamisx.blog.163.com/blog/static/115004839201221353641643/
    最后说一句，如果你有什么更好的方法或改进，别忘了告诉我啊。共享万岁！

本站原创作品，未经作者许可，严禁任何方式转载；转载作品，如果侵犯了您的权益，请联系我们！

龙脉加密锁 15元起

Grid++Report 报表

申请支付@网

没有相关文章

			捐赠 \| 广告 \| 注册 \| 发布 \| 上传 \| 关于我们
		粤ICP备10103342号-1	DELPHI盒子 \| 盒子文章 \| 盒子问答悬赏 \| 最新更新 \| 盒子检索 \| 下载中心 \| 高级搜索
		粤ICP备10103342号-1	精品专区 \| 繁體中文 \| 奖励公告栏 \| 直通车账号登陆 \| 关闭GOOGLE广告 \| 临时留言


		DELPHI盒子版权所有技术支持:深圳市麟瑞科技有限公司 1999-2024 V4.01 粤ICP备10103342号-1