JAVA串联射频技术_tf-idf的java实现-白红宇的个人博客

JAVA串联射频技术_tf-idf的java实现

发布日期：2021-08-20 05:18:44 浏览次数：58 分类：技术文章

本文共 4595 字，大约阅读时间需要 15 分钟。

package com.bobo.paper.athology;

import java.io.IOException;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.Iterator;

import java.util.List;

import java.util.Map;

import com.bobo.paper.util.CutWordsUtil;

import com.bobo.paper.util.FileUtil;public classTfIdfAthology {/**

* 统计各个词语列表中各个词语出现的次数

* @param cutwords 分词之后的词语列表

* @return 返回一个hashmap，key为词，value为词出现的次数*/

public static HashMap normalTF(ArrayListcutwords){

HashMap resTF = new HashMap();for(String word : cutwords){if(resTF.get(word) == null){

resTF.put(word,1);

System.out.println(word);

}else{

resTF.put(word, resTF.get(word) + 1);

System.out.println(word.toString());

}

}returnresTF;

}/**

* 统计词频，即tf值

* @param cutwords 分词之后的词语列表

* @return*/

public static HashMap tf(ArrayListcutwords){

HashMap resTF = new HashMap();int wordLen =cutwords.size();

HashMap intTF =normalTF(cutwords);

Iterator iter= intTF.entrySet().iterator(); //iterator for that get from TF

while(iter.hasNext()){

Map.Entry entry=(Map.Entry)iter.next();

resTF.put(entry.getKey().toString(), Float.parseFloat(entry.getValue().toString())/wordLen);

System.out.println(entry.getKey().toString() + "="+ Float.parseFloat(entry.getValue().toString()) /wordLen);

}returnresTF;

}/**

* 将以个目录下所有的文件进行分词，返回一个HashMap> ，前面一个key是文件名，后面一个key是词，其值为该词出现的次数

* @param dirc

* @return

* @throws IOException*/

public static HashMap>normalTFAllFiles(String dirc) throws IOException{

HashMap> allNormalTF = new HashMap>();

List filelist =FileUtil.readDirs(dirc);for(String file : filelist){

HashMap dict = new HashMap();

ArrayList cutwords = CutWordsUtil.cutWords(file); //get cut word for one file

dict=normalTF(cutwords);

allNormalTF.put(file, dict);

}returnallNormalTF;

}/**

* 計算一個目錄下所有文件中詞語的詞頻

* @param dirc 目錄名稱

* @return 返回一個HashMap>，第一個key是文件名，第二個key是詞，value是該詞語在該文件中的頻率

* @throws IOException*/

public static HashMap>tfAllFiles(String dirc) throws IOException{

HashMap> allTF = new HashMap>();

List filelist =FileUtil.readDirs(dirc);for(String file : filelist){

HashMap dict = new HashMap();

ArrayList cutwords = CutWordsUtil.cutWords(file); //get cut words for one file

dict=tf(cutwords);

allTF.put(file, dict);

}returnallTF;

}/**

* 計算词语的idf值 log(|D|/{包含该词语的文档个数+1})

* @param all_tf 爲HashMap>，第一個key爲文件名，第二個key爲詞語，float代表該詞語在本文件中的詞頻

* @return*/

public static HashMap idf(HashMap>all_tf){

HashMap resIdf = new HashMap();//dict的key值为词，其value为出现该词的文档个数

HashMap dict = new HashMap();int docNum =FileUtil.FileList.size();//循环所有的文件

for(int i = 0; i < docNum; i++){//all_tf中記錄的是

HashMap temp = all_tf.get(FileUtil.FileList.get(i));

Iterator iter=temp.entrySet().iterator();while(iter.hasNext()){//循环一个文件中的所有词语的词频

Map.Entry entry =(Map.Entry)iter.next();

String word=entry.getKey().toString();//IDF的公式，idfi=log(|D|/|{j:ti屬於dj}|)，其中|D|爲語料庫中的文件總數目，|{j:ti屬於dj}|指的是包含詞語ti的文件數目，如果该词语不在语料库中，就会导致被除数为零，因此一般情况下使用1 + |\{j : t_{i} \in d_{j}\}|

if(dict.get(word) == null){

dict.put(word,1);

}else{

dict.put(word, dict.get(word) + 1);

}

System.out.println("IDF for every word is:");

Iterator iter_dict=dict.entrySet().iterator();while(iter_dict.hasNext()){

Map.Entry entry=(Map.Entry)iter_dict.next();float value = (float)Math.log(docNum /Float.parseFloat(entry.getValue().toString()));

resIdf.put(entry.getKey().toString(), value);

System.out.println(entry.getKey().toString() + "=" +value);

}returnresIdf;

}/**

* 计算某个词语的idf值

* @param all_tf 记录所有词语tf值的map，第一个key为文件名，第二个key为词语

* @param idfs 记录所有词语idf值的map，key为词语*/

public static void tf_idf(HashMap> all_tf,HashMapidfs){

HashMap> resTfIdf = new HashMap>();int docNum =FileUtil.FileList.size();for(int i = 0; i < docNum; i++){

String filepath= FileUtil.FileList.get(i);

HashMap tfidf = new HashMap();

HashMap temp = all_tf.get(filepath);

Iterator iter=temp.entrySet().iterator();while(iter.hasNext()){

Map.Entry entry=(Map.Entry)iter.next();

String word=entry.getKey().toString();

Float value= (float)Float.parseFloat(entry.getValue().toString()) * idfs.get(word);

tfidf.put(word, value);

}

resTfIdf.put(filepath, tfidf);

}

System.out.println("TF-IDF for Every file is :");

DisTfIdf(resTfIdf);

}//這個主要用來顯示最終計算得到的tf-idf值

public static void DisTfIdf(HashMap>tfidf){

Iterator iter1=tfidf.entrySet().iterator();while(iter1.hasNext()){

Map.Entry entrys=(Map.Entry)iter1.next();

System.out.println("FileName:" +entrys.getKey().toString());

System.out.print("{");

HashMap temp = (HashMap) entrys.getValue();

Iterator iter2=temp.entrySet().iterator();while(iter2.hasNext()){

Map.Entry entry=(Map.Entry)iter2.next();

System.out.print(entry.getKey().toString() + "=" + entry.getValue().toString() + ",");

}

System.out.println("}");

}

转载地址：https://blog.csdn.net/weixin_33501587/article/details/114715251 如侵犯您的版权，请留言回复原文章的地址，我们会给您删除此文章，给您带来不便请您谅解！

上一篇：lableimg闪退_打标工具labelme或者labelimg遇到图片闪退的完美解决方案

下一篇：java读取4个字节_一次读取4个字节 (Reading in 4 bytes at a time)

发表评论

关于作者

喝酒易醉，品茶养心，人生如梦，品茶悟道，何以解忧？唯有杜康！

-- 愿君每日到此一游！

发表评论

最新留言

关于作者

推荐文章