- // word.cpp : Defines the entry point for the console application.
- //
- #include "stdafx.h"
- // mympseg.cpp : 定义控制台应用程序的入口点。
- #include <iostream>
- #include <string>
- #include <list>
- #include <map>
- using namespace std;
- const int NODENUM=100;//最大节点数
- map <string,float> mapWord2Prob;
- const int MaxWordLength=8;//词的最大长度
- const int MinWordLength=2;//词的最小长度
- float prob[NODENUM];//每个节点的最大累计概率
- int prevNode[NODENUM];//每个节点的最优前趋节点
- const string strDelimiter=" ";//分词分割符
- void LoadDict()
- {
- mapWord2Prob["有"]=0.018;
- mapWord2Prob["有意"]=0.0005;
- mapWord2Prob["意见"]=0.001;
- mapWord2Prob["见"]=0.0002;
- mapWord2Prob["分歧"]=0.0001;
- }
- //词作为边
- struct edgeNode
- {
- string termText;//词
- int start;//词的开始位置
- int end;//词的结束位置
- float prob;//词在语料库中出现的频率或者概率
- };
- //分割位置作为点
- struct vexNode
- {
- int nSegNo; //分割节点编号
- list <edgeNode> linkedlist; //与之相连的链表head
- };
- //存储节点信息
- vexNode adjlist[NODENUM];
- //建立邻接表存储
- void InitialGraph()
- {
- int i=0;
- for(i=0;i<NODENUM;i++)
- {
- adjlist[i].nSegNo=i;
- }
- }
- //插入一条边
- void InsertEdge(edgeNode & newEdgeNode)
- {
- int v1=newEdgeNode.end;
- adjlist[v1].linkedlist.push_front(newEdgeNode);
- }
- //形成切分词图
- void CreateWordSegGraph(string s)
- {
- short i=0;
- short j=0;
- short len=0;
- short restlen=0;
- short n=s.length();
- float freq=0;
- string w;
- for(j=0;j<n;j+=2)
- {
- for(len=2;len<=MaxWordLength;len+=2)
- {
- restlen=n-j;
- if (len<=restlen) // 如果剩余词长度不够长,跳出循环
- {
- w=s.substr(j,len);
- }
- else
- {
- break;
- }
- if (mapWord2Prob.find(w) != mapWord2Prob.end())
- {
- freq=mapWord2Prob[w];
- }
- else
- {
- freq=100;
- }
- if(freq != 100)
- {
- edgeNode stCandidateWord;
- stCandidateWord.termText=w;
- stCandidateWord.start=j/MinWordLength;
- stCandidateWord.end=(j+len)/MinWordLength;
- stCandidateWord.prob = freq;
- cout<<"插入一条边 word "<<w<<" freq "<<stCandidateWord.prob<<" start "<<stCandidateWord.start<<" end "<<stCandidateWord.end<<endl;
- InsertEdge(stCandidateWord);
- }
- }
- }
- }
- //寻找i的最佳前趋词
- void getPrev(int i)
- {
- vexNode oVexNode=adjlist[i];
- edgeNode oEdgeNode;
- list <edgeNode>::iterator itList;
- //得到前驱词的集合
- float maxProb = 0;
- int maxNode = -1;
- //根据前驱词集合挑选最佳前趋节点
- for(itList=oVexNode.linkedlist.begin(); itList!=oVexNode.linkedlist.end();itList++ )
- {
- float nodeProb = prob[itList->start]+itList->prob;//候选节点概率
- cout<<"搜索结点 "<<i<<" 其前趋累计概率 "<<nodeProb<<" 开始位置 "<<itList->start<<endl;
- if (nodeProb > maxProb)
- {
- //候选节点概率最大的开始节点就是最佳前趋节点
- maxNode = itList->start;
- maxProb = nodeProb;
- }
- }
- prob[i] = maxProb;//节点概率
- prevNode[i] = maxNode;//最佳前驱节点
- cout<<"节点"<<i<<"的最佳前驱节点 "<<maxNode<<" 概率"<< maxProb<<endl;
- }
- //回溯求最佳分割点
- string BestSeg(string strSequence)
- {
- string strSeg="";
- int nLeft=0;
- int nRight=0;
- for(int i=strSequence.length()/MinWordLength; i>0;i=prevNode[i])
- {
- cout<<"最佳路径结点 "<<i<<endl;
- nRight=i;
- nLeft=prevNode[i];
- cout<<"词 "<<strSequence.substr(nLeft*MinWordLength,(nRight-nLeft)*MinWordLength)<<endl;
- strSeg=strSequence.substr(nLeft*MinWordLength,(nRight-nLeft)*MinWordLength)+strDelimiter+strSeg;
- }
- return strSeg;
- }
- //int main(int argc, char * argv[])
- int _tmain(int argc, _TCHAR* argv[])
- {
- LoadDict();
- InitialGraph();
- for (int i=0;i<NODENUM;i++)
- {
- prob[i]=0;
- prevNode[i]=0;
- }
- string strSequence="有意见分歧";
- //构建邻接表表示的切分词图,(有向图)
- CreateWordSegGraph(strSequence);
- cout<<"寻找最佳前趋__________________________"<<endl;
- for (int i=1;i<=strSequence.length()/MinWordLength;i++)
- {
- getPrev(i);
- }
- cout<<"最佳切分路径 "<<endl;
- cout<<"分词结果 "<<BestSeg(strSequence)<<endl;
- int m = 0;
- cin>>m;
- return 0;
- }
- //该片段来自于http://www.codesnippet.cn/detail/040520149429.html
来源: http://www.codesnippet.cn/detail/040520149429.html