找传奇、传世资源到传世资源站!

C# 版文档相似度比较:TF-IDF 算法的实现

8.5玩家评分(1人评分)
下载后可评
介绍 评论 失效链接反馈

可直接拿来测试哦

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace Test.TFIDF
{
    /// <summary>
    /// TF-IDF weighting and cosine similarity for documents, where every
    /// Chinese character (U+4E00..U+9FA5) is treated as one "word".
    /// All other characters are ignored.
    /// </summary>
    class IF_IDF
    {
        // Matches a single Chinese character; built once instead of on every call.
        private static readonly Regex ChineseCharacter = new Regex(@"[\u4e00-\u9fa5]");

        /// <summary>
        /// Splits the text into words (single Chinese characters) and counts
        /// how many times each one occurs.
        /// </summary>
        /// <param name="text">The document text.</param>
        /// <returns>A dictionary mapping each distinct word to its occurrence count.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public Dictionary<string, int> GetWordsFrequnce(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            Dictionary<string, int> dictionary = new Dictionary<string, int>();
            foreach (Match word in ChineseCharacter.Matches(text))
            {
                // TryGetValue leaves count at 0 for a word seen for the first time.
                int count;
                dictionary.TryGetValue(word.Value, out count);
                dictionary[word.Value] = count + 1;
            }
            return dictionary;
        }

        /// <summary>
        /// Returns the occurrence count of the most frequent word in a document.
        /// </summary>
        /// <param name="wordsfre">Word/count dictionary produced by <see cref="GetWordsFrequnce"/>.</param>
        /// <returns>The largest count, or 0 when the dictionary is empty.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="wordsfre"/> is null.</exception>
        public int MaxWordFrequence(Dictionary<string, int> wordsfre)
        {
            if (wordsfre == null)
            {
                throw new ArgumentNullException("wordsfre");
            }

            int maxfre = 0;
            foreach (int value in wordsfre.Values)
            {
                if (maxfre < value)
                {
                    maxfre = value;
                }
            }
            return maxfre;
        }

        /// <summary>
        /// Computes the normalised term frequency of every distinct word in the
        /// text: the word's count divided by the count of the most frequent word.
        /// </summary>
        /// <param name="text">The document text.</param>
        /// <returns>
        /// One value in (0, 1] per distinct word, in the enumeration order of the
        /// dictionary returned by <see cref="GetWordsFrequnce"/>.
        /// </returns>
        public double[] TF(string text)
        {
            Dictionary<string, int> dictionary = GetWordsFrequnce(text);
            int maxFre = MaxWordFrequence(dictionary);
            double[] tf = new double[dictionary.Count];

            int flag = 0;
            foreach (int fre in dictionary.Values)
            {
                // Cast first: int / int would truncate every ratio to 0 or 1.
                tf[flag] = (double)fre / maxFre;
                flag++;
            }
            return tf;
        }

        /// <summary>
        /// Computes the inverse document frequency of every distinct word in the
        /// text, as log2(N / df) + 1, where N is the number of documents and df
        /// the number of documents that contain the word.
        /// </summary>
        /// <param name="text">The document whose words are weighted.</param>
        /// <param name="texts">The whole corpus.</param>
        /// <returns>
        /// One value per distinct word, in the same order as <see cref="TF"/>.
        /// A word that occurs in no document of the corpus gets 0.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="texts"/> is null.</exception>
        public double[] IDF(string text, string[] texts)
        {
            if (texts == null)
            {
                throw new ArgumentNullException("texts");
            }

            Dictionary<string, int> dictionary = GetWordsFrequnce(text);
            double[] idf = new double[dictionary.Count];

            int flag = 0;
            foreach (string word in dictionary.Keys)
            {
                idf[flag] = InverseDocumentFrequency(word, texts);
                flag++;
            }
            return idf;
        }

        /// <summary>
        /// Computes the TF-IDF weight of every distinct word of every document.
        /// </summary>
        /// <param name="texts">The whole corpus.</param>
        /// <returns>
        /// A jagged array: element [d][k] is the weight of the k-th distinct word
        /// of document d. Rows belong to different vocabularies, so the same
        /// column index in two rows generally refers to different words.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="texts"/> is null.</exception>
        public double[][] TF_IDF(string[] texts)
        {
            if (texts == null)
            {
                throw new ArgumentNullException("texts");
            }

            double[][] tf_idf = new double[texts.Length][];
            for (int i = 0; i < texts.Length; i++)
            {
                double[] tf = TF(texts[i]);
                double[] idf = IDF(texts[i], texts);
                tf_idf[i] = new double[tf.Length];
                for (int j = 0; j < tf.Length; j++)
                {
                    tf_idf[i][j] = tf[j] * idf[j];
                }
            }
            return tf_idf;
        }

        /// <summary>
        /// Computes the cosine similarity between two documents of the corpus
        /// using their TF-IDF vectors.
        /// </summary>
        /// <param name="i">1-based index of the first document.</param>
        /// <param name="j">1-based index of the second document.</param>
        /// <param name="texts">The whole corpus.</param>
        /// <returns>
        /// The cosine of the angle between the two weight vectors, or 0 when
        /// either document contains no Chinese characters.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="texts"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="i"/> or <paramref name="j"/> is not in 1..texts.Length.
        /// </exception>
        public double Similarity(int i, int j, string[] texts)
        {
            if (texts == null)
            {
                throw new ArgumentNullException("texts");
            }
            if (i < 1 || i > texts.Length)
            {
                throw new ArgumentOutOfRangeException("i");
            }
            if (j < 1 || j > texts.Length)
            {
                throw new ArgumentOutOfRangeException("j");
            }

            // Weights are keyed by word so that the two vectors are aligned on
            // the same terms; multiplying by array position would pair up
            // unrelated words.
            Dictionary<string, double> first = GetTermWeights(texts[i - 1], texts);
            Dictionary<string, double> second = GetTermWeights(texts[j - 1], texts);

            // Inner product over the words the two documents share.
            double sum = 0;
            foreach (KeyValuePair<string, double> term in first)
            {
                double other;
                if (second.TryGetValue(term.Key, out other))
                {
                    sum += term.Value * other;
                }
            }

            double i_length = VectorLength(first);
            double j_length = VectorLength(second);

            // A zero vector has no direction; report "not similar" rather than NaN.
            if (i_length == 0 || j_length == 0)
            {
                return 0;
            }

            // Cosine formula: inner product divided by the product of the lengths.
            return sum / (i_length * j_length);
        }

        // Returns log2(N / df) + 1 for one word, or 0 when no document contains it.
        private static double InverseDocumentFrequency(string word, string[] texts)
        {
            int containing = 0;
            for (int j = 0; j < texts.Length; j++)
            {
                if (texts[j] != null && texts[j].Contains(word))
                {
                    containing++;
                }
            }

            if (containing == 0)
            {
                return 0;
            }

            // Cast first: int / int would truncate the ratio before the logarithm.
            return Math.Log((double)texts.Length / containing, 2) + 1;
        }

        // Builds the word -> TF-IDF weight map of one document against the corpus.
        private Dictionary<string, double> GetTermWeights(string text, string[] texts)
        {
            Dictionary<string, int> frequencies = GetWordsFrequnce(text);
            int maxFre = MaxWordFrequence(frequencies);

            Dictionary<string, double> weights = new Dictionary<string, double>(frequencies.Count);
            foreach (KeyValuePair<string, int> entry in frequencies)
            {
                double tf = (double)entry.Value / maxFre;
                weights[entry.Key] = tf * InverseDocumentFrequency(entry.Key, texts);
            }
            return weights;
        }

        // Returns the Euclidean length of a weight vector.
        private static double VectorLength(Dictionary<string, double> weights)
        {
            double squares = 0;
            foreach (double weight in weights.Values)
            {
                squares += weight * weight;
            }
            return Math.Sqrt(squares);
        }
    }
}

评论

发表评论必须先登录, 您可以 登录 或者 注册新账号 !


在线咨询: 问题反馈
客服QQ:174666394

有问题请留言,看到后及时答复