zoukankan      html  css  js  c++  java
  • 找出一段文字中出现次数最多的前10个单词以及次数

    1、找出一段文字中出现次数最多的前10个单词以及次数

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    
    namespace ConsoleApplication1
    {
        /// <summary>
        /// Demo program: reports the ten most frequent words of a sample sentence.
        /// </summary>
        class Program
        {
            /// <summary>Maximum number of entries returned by WordCount.</summary>
            private const int TopWords = 10;

            /// <summary>
            /// Counts the words of a fixed sample sentence and prints the most frequent ones.
            /// </summary>
            /// <param name="args">Command-line arguments (not used).</param>
            static void Main(string[] args)
            {
                string str = "wo men wo men wo ni hao ni hao we er ty ui o pp pp pp aa aa aa";
                Dictionary<string, int> dic = WordCount(str);

                // Print the result, which used to be computed and then thrown away.
                // Sort again here because Dictionary does not promise any enumeration order.
                foreach (KeyValuePair<string, int> pair in dic.OrderByDescending(p => p.Value))
                {
                    Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
                }
            }

            /// <summary>
            /// Finds the most frequent words in a piece of text.
            /// </summary>
            /// <param name="mes">Text whose words are separated by white space. May be null or empty.</param>
            /// <returns>
            /// A dictionary mapping each of the (at most ten) most frequent words to its number of
            /// occurrences. Empty when <paramref name="mes"/> is null or contains no words.
            /// </returns>
            static Dictionary<string, int> WordCount(string mes)
            {
                Dictionary<string, int> counts = new Dictionary<string, int>();
                if (string.IsNullOrEmpty(mes))
                {
                    return counts;
                }

                // A null separator array makes Split break on every white-space character,
                // so tabs and line breaks separate words just like spaces do.
                string[] words = mes.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                foreach (string word in words)
                {
                    int seen;
                    counts.TryGetValue(word, out seen); // seen stays 0 for a word not counted yet
                    counts[word] = seen + 1;
                }

                // Take the top entries from the ordered sequence itself; copying the sorted
                // sequence into a Dictionary first would rely on undocumented enumeration order.
                return counts
                    .OrderByDescending(pair => pair.Value)
                    .Take(TopWords)
                    .ToDictionary(pair => pair.Key, pair => pair.Value);
            }
        }
    }

    2、自己练习了一下从大文件读取,统计单词重复次数

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    using System.Threading.Tasks.Dataflow;
    
    namespace readbigfile
    {
        /// <summary>
        /// Counts word frequencies in one large text file with a producer/consumer pair:
        /// ReadFile feeds pieces of the file into a buffer and Process counts the words of each piece.
        /// </summary>
        class Program
        {
            // Number of characters read from the file per chunk (32M chars).
            private const int ChunkChars = 32 * 1024 * 1024;

            // Number of most frequent words reported by Main.
            private const int TopWords = 10;

            // Queue between reader and counter. It is bounded so that the reader waits for the
            // counter instead of piling the whole file up in memory.
            static System.Threading.Tasks.Dataflow.BufferBlock<string> m_buffer =
                new System.Threading.Tasks.Dataflow.BufferBlock<string>(
                    new DataflowBlockOptions { BoundedCapacity = 2 });

            // Word -> number of occurrences. Written only by Process; Main reads it after
            // Process has completed.
            static Dictionary<string, int> dicAll = new Dictionary<string, int>();

            /// <summary>
            /// Reads the file, counts its words and prints the most frequent ones.
            /// </summary>
            /// <param name="args">Optional: args[0] overrides the built-in file path.</param>
            static void Main(string[] args)
            {
                // NOTE(review): the built-in path contains no directory separators; they may
                // have been lost when the code was published - confirm before relying on it.
                string fimename = args.Length > 0 ? args[0] : @"C:planData1G.txt";

                // The file is large, so one task reads while another one counts.
                Task t1 = Task.Run(() => ReadFile(fimename));
                // Process returns a Task, so waiting on t2 really waits until counting is done.
                Task t2 = Task.Run(() => Process());
                Task.WaitAll(t1, t2);

                // Most frequent words first; keep only the top entries.
                List<KeyValuePair<string, int>> top = dicAll
                    .OrderByDescending(v => v.Value)
                    .Take(TopWords)
                    .ToList();
                foreach (KeyValuePair<string, int> pair in top)
                {
                    Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
                }
            }

            /// <summary>
            /// Reads the file chunk by chunk and hands the text to the buffer, cutting only at
            /// white space so that no word is split between two pieces. Always marks the buffer
            /// as complete when done, even if reading fails.
            /// </summary>
            /// <param name="filename">Path of the text file to read.</param>
            public static void ReadFile(string filename)
            {
                try
                {
                    using (System.IO.FileStream fs = new System.IO.FileStream(filename, FileMode.Open, System.IO.FileAccess.Read))
                    using (StreamReader sr = new StreamReader(fs))
                    {
                        char[] chunk = new char[ChunkChars];
                        string carry = string.Empty; // unfinished word left over from the previous chunk
                        int read;
                        // ReadBlock reports how many characters it really delivered; only those
                        // are used, the rest of the array is stale.
                        while ((read = sr.ReadBlock(chunk, 0, chunk.Length)) > 0)
                        {
                            // cut = length of the prefix that ends with a white-space character.
                            int cut = read;
                            while (cut > 0 && !char.IsWhiteSpace(chunk[cut - 1]))
                            {
                                cut--;
                            }

                            if (cut == 0)
                            {
                                // No white space at all: the chunk is part of one long word.
                                carry += new string(chunk, 0, read);
                                continue;
                            }

                            if (!Send(carry + new string(chunk, 0, cut)))
                            {
                                return; // the counter failed; reading on would be pointless
                            }
                            carry = new string(chunk, cut, read - cut);
                        }

                        if (carry.Length > 0)
                        {
                            Send(carry);
                        }
                    }
                }
                finally
                {
                    // Without this the counter would wait forever after a read error.
                    m_buffer.Complete();
                }
            }

            /// <summary>
            /// Counts the words of every piece of text that arrives in the buffer until the
            /// buffer is completed and empty.
            /// </summary>
            /// <returns>A task that finishes when all buffered text has been counted.</returns>
            private static async Task Process()
            {
                try
                {
                    while (await m_buffer.OutputAvailableAsync())
                    {
                        string receive;
                        while (m_buffer.TryReceive(out receive))
                        {
                            CountWords(receive);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Fault the buffer so that a reader waiting for free space is released.
                    ((IDataflowBlock)m_buffer).Fault(ex);
                    throw;
                }
            }

            // Hands one piece of text to the counter, waiting while the buffer is full.
            // Returns false when the buffer no longer accepts data.
            // ReadFile is synchronous and runs on its own task, so blocking here is the
            // intended back-pressure.
            private static bool Send(string text)
            {
                return m_buffer.SendAsync(text).Result;
            }

            // Adds the white-space separated words of text to dicAll.
            private static void CountWords(string text)
            {
                string[] words = text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                foreach (string word in words)
                {
                    int seen;
                    dicAll.TryGetValue(word, out seen); // seen stays 0 for a word not counted yet
                    dicAll[word] = seen + 1;
                }
            }
        }
    }

    3、数据来源于多个文件,优化2,处理使用多task处理,测试结果处理时间与2相差不大,但是还是记录了一下

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    using System.Threading.Tasks.Dataflow;
    
    
    namespace readbigtext2
    {
        /// <summary>
        /// Counts word frequencies over all files of a directory: one task reads the files
        /// into a buffer while several tasks count the words, each into its own dictionary.
        /// </summary>
        class Program
        {
            // Number of characters read from a file per chunk (32K chars).
            private const int ChunkChars = 32 * 1024;

            // Number of counting tasks started by Main.
            private const int ConsumerCount = 10;

            // Number of most frequent words reported by Main.
            private const int TopWords = 10;

            static System.Threading.Tasks.Dataflow.BufferBlock<string> m_buffer = new System.Threading.Tasks.Dataflow.BufferBlock<string>();

            /// <summary>
            /// Reads all files, counts the words, prints the most frequent ones and the
            /// elapsed time in milliseconds, then waits for Enter.
            /// </summary>
            /// <param name="args">Optional: args[0] overrides the built-in directory path.</param>
            static void Main(string[] args)
            {
                // Author's notes for about 1 GB of data (measured with an earlier version):
                // 1. Single thread (reading and processing together): runs out of memory.
                // 2. Two threads, one reading and one processing: about 14 s.
                // 3. Ten processing tasks: about the same time as one task.
                System.Diagnostics.Stopwatch st = new System.Diagnostics.Stopwatch();
                st.Start();

                // NOTE(review): the built-in path contains no directory separators; they may
                // have been lost when the code was published - confirm before relying on it.
                string fimepath = args.Length > 0 ? args[0] : @"C:Usersxiaochun-zhaiDocumentsigtext";
                Task t1 = Task.Run(() => ReadFile(fimepath));

                // Start every consumer on the thread pool. Calling Process() directly would run
                // it on this thread for as long as data is available, before the next consumer
                // is even created.
                List<Task<Dictionary<string, int>>> _list = new List<Task<Dictionary<string, int>>>();
                for (int i = 0; i < ConsumerCount; i++)
                {
                    _list.Add(Task.Run(() => Process()));
                }

                Task.WaitAll(_list.ToArray());
                t1.Wait(); // surfaces a read error instead of silently dropping it

                // Merge the per-consumer counts.
                Dictionary<string, int> dic = new Dictionary<string, int>();
                foreach (Task<Dictionary<string, int>> item in _list)
                {
                    foreach (KeyValuePair<string, int> row in item.Result)
                    {
                        int seen;
                        dic.TryGetValue(row.Key, out seen); // seen stays 0 for a new word
                        dic[row.Key] = seen + row.Value;
                    }
                }

                // Most frequent words first; keep only the top entries.
                List<KeyValuePair<string, int>> top = dic
                    .OrderByDescending(v => v.Value)
                    .Take(TopWords)
                    .ToList();

                st.Stop();
                foreach (KeyValuePair<string, int> pair in top)
                {
                    Console.WriteLine("{0}: {1}", pair.Key, pair.Value);
                }
                Console.WriteLine(st.ElapsedMilliseconds);

                Console.ReadLine();
            }

            /// <summary>
            /// Reads every file of the directory chunk by chunk and posts the text to the
            /// buffer, cutting only at white space so that no word is split between two pieces.
            /// Writes each file's full name to the console once it has been read. Always marks
            /// the buffer as complete when done, even if reading fails.
            /// </summary>
            /// <param name="filepath">Path of the directory whose files are read.</param>
            public static void ReadFile(string filepath)
            {
                try
                {
                    DirectoryInfo dif = new DirectoryInfo(filepath);
                    char[] chunk = new char[ChunkChars];
                    foreach (FileInfo item in dif.GetFiles())
                    {
                        using (System.IO.FileStream fs = new System.IO.FileStream(item.FullName, FileMode.Open, System.IO.FileAccess.Read))
                        using (StreamReader sr = new StreamReader(fs))
                        {
                            string carry = string.Empty; // unfinished word left over from the previous chunk
                            int read;
                            // ReadBlock reports how many characters it really delivered; only
                            // those are used, the rest of the array is stale.
                            while ((read = sr.ReadBlock(chunk, 0, chunk.Length)) > 0)
                            {
                                // cut = length of the prefix that ends with a white-space character.
                                int cut = read;
                                while (cut > 0 && !char.IsWhiteSpace(chunk[cut - 1]))
                                {
                                    cut--;
                                }

                                if (cut == 0)
                                {
                                    // No white space at all: the chunk is part of one long word.
                                    carry += new string(chunk, 0, read);
                                    continue;
                                }

                                m_buffer.Post(carry + new string(chunk, 0, cut));
                                carry = new string(chunk, cut, read - cut);
                            }

                            // A word never continues in the next file, so flush the tail here.
                            if (carry.Length > 0)
                            {
                                m_buffer.Post(carry);
                            }
                        }
                        Console.WriteLine(item.FullName);
                    }
                }
                finally
                {
                    // Without this the consumers (and Main) would wait forever after a read error.
                    m_buffer.Complete();
                }
            }

            /// <summary>
            /// Counts the words of the pieces of text this consumer manages to take from the
            /// buffer, until the buffer is completed and empty.
            /// </summary>
            /// <returns>The word counts gathered by this consumer.</returns>
            private static async Task<Dictionary<string, int>> Process()
            {
                Dictionary<string, int> dicAll = new Dictionary<string, int>();
                while (await m_buffer.OutputAvailableAsync())
                {
                    string receive;
                    // TryReceive fails when another consumer took the item first; then just
                    // go back to waiting.
                    while (m_buffer.TryReceive(out receive))
                    {
                        string[] array = receive.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                        foreach (string word in array)
                        {
                            int seen;
                            dicAll.TryGetValue(word, out seen); // seen stays 0 for a new word
                            dicAll[word] = seen + 1;
                        }
                    }
                }
                return dicAll;
            }
        }
    }

    4、如果文件非常大,连单词计数的结果都无法一次放入内存,那就需要使用外部归并排序了:先把文件分成若干块,每块分别统计并排序后写入临时文件,最后再归并各块的结果。

    5、扩展

    统计大文件里,频率最高的10个单词,(C# TPL DataFlow版)

  • 相关阅读:
    sql 临时表循环更新月租金
    董事长审核租金异常处理备份
    datetable导出成Excel
    DateTable导出添加时间段
    button 美化
    JS计算两日期之间相差的月份
    刚做的JS,备份一下(空代表格计算)
    Windows 框架基础开发流程
    照片切换
    Sql datetime类型数据默认1900
  • 原文地址:https://www.cnblogs.com/xiaochun126/p/5034685.html
Copyright © 2011-2022 走看看