// Converts a training corpus into the format accepted by CRF.
// author benbendy
// date 2009 1 2
4
5 #include<iostream>
6 #include<fstream>
7 #include<string>
8
9 using namespace std;
10
// Tests whether a pair of bytes forms a two-byte character that this tool
// treats as "Chinese".
//
// Mind the argument order used by the callers in this file: they pass the
// SECOND byte of the pair as `hi` and the FIRST byte as `lo`.
//
//   lo  must lie in 0x81..0xFE but outside 0xA1..0xA9
//   hi  must be at least 0x40 and must be neither 0x7F nor 0xFF
//
// NOTE(review): these ranges look like the GBK double-byte layout with the
// symbol rows excluded -- confirm against the encoding of the input files.
//
// Returns true only when both bytes satisfy the constraints above.
bool chinese(unsigned char hi,unsigned char lo)
{
    const bool firstByteInRange = (lo >= 0x81) && (lo <= 0xFE);
    const bool firstByteExcluded = (lo >= 0xA1) && (lo <= 0xA9);
    const bool secondByteAccepted = (hi >= 0x40) && (hi != 0xFF) && (hi != 0x7F);

    return firstByteInRange && !firstByteExcluded && secondByteAccepted;
}
22
23 int main()
24 {
25 string str;
26 ifstream cin("in.txt");
27 ofstream cout("out.txt");
28 ofstream fout("training.txt");
29 while(getline(cin,str))
30 {
31 bool flag=true;
32 for(int i=0;i<str.size()-1;i++)
33 {
34
35 if(chinese(str[i+1],str[i]))
36 i++;
37 else if(str[i]!=' ')
38 {
39 flag=false;
40 break;
41 }
42 }
43 if(!flag)
44 continue;
45 cout<<str<<endl;
46
47 char s[3];
48 s[2]='\0';
49 bool bef=false;
50 for(int i=0;i<str.size()-1;i++)
51 {
52 if(chinese(str[i+1],str[i]))
53 {
54 if(!bef )
55 {
56 s[0]=str[i];
57 s[1]=str[i+1];
58 fout<<s<<" B"<<endl;
59 bef=true;
60 }
61 else{
62 s[0]=str[i];
63 s[1]=str[i+1];
64 fout<<s<<" I"<<endl;
65 bef=true;
66
67 }
68 i++;
69 }
70 else {
71 bef=false;
72 }
73 }
74 fout<<endl;
75 }
76
77 }
78
79
80
81
82
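// ---- Separate program: converts raw text directly into the CRF test format. ----
// It defines its own main(), so it must be built separately from the program above.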
// Converts raw text into the one-character-per-line format used for CRF
// testing.
//
// Reads "in.txt" line by line and writes "out.txt":
//   * a byte in 0x81..0xFE is taken together with the following byte as one
//     two-byte character and written on its own line, unless it equals one
//     of the listed punctuation tokens, in which case a blank line is
//     written instead;
//   * an ASCII ',' or '.' produces a blank line;
//   * any other single byte is dropped;
//   * a blank line is written after every input line.
//
// Returns 0 on success, 1 if "in.txt" cannot be opened.
int main()
{
    // Range of bytes treated as the first byte of a two-byte character.
    // Held as unsigned values so the test does not depend on whether plain
    // `char` is signed on the target platform.
    const unsigned char low=0x81;
    const unsigned char up=0xfe;

    std::ifstream fin("in.txt");
    if(!fin)
    {
        std::cerr<<"cannot open in.txt"<<std::endl;
        return 1;
    }
    std::ofstream fout("out.txt");

    std::string str;
    while(std::getline(fin,str))
    {
        // `i+1<str.size()` guarantees a full byte pair is available. The
        // original bound `str.size()-1` wrapped around on an empty line and
        // walked far past the end of the buffer.
        for(std::string::size_type i=0;i+1<str.size();i++)
        {
            const unsigned char first=static_cast<unsigned char>(str[i]);
            if(first>=low&&first<=up)
            {
                const std::string temp=str.substr(i,2);
                i++;            // skip the second byte of the character

                // Punctuation or other symbols end the current sequence.
                // NOTE(review): temp always holds exactly two bytes, so the
                // one-byte literals and the three-byte "> !" can never match,
                // and the remaining literals only match if this source file
                // is saved in the same two-byte encoding as the input. The
                // literals are left untouched -- confirm the intended
                // characters.
                if(temp=="("||temp==")"||temp=="。"||temp==","||temp==":"||temp=="》"||temp=="《"||temp=="、"||temp=="> !")
                    fout<<'\n';
                else
                    fout<<temp<<'\n';
            }
            else if(str[i]==','||str[i]=='.')
                fout<<'\n';
        }
        fout<<'\n';
    }

    return 0;
}