zoukankan      html  css  js  c++  java
  • 【.Net】 大文件可使用的文本分组统计工具(附带源码,原创)

    本工具可实现的效果:

    1.读取大文件(大于1GB)

    2.根据分隔符分割后的列分组

    3.速度快。

    4.处理过程中,可以随时停止处理,操作不卡死。

    5.有对当前内存的实时监测,避免过多占用内存,影响系统运行。

    6.实时显示处理的行数。

    处理类代码:

    using System;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.IO;
    using System.Linq;
    using System.Text;
    
    namespace DaZhongLogTool
    {
    
        /// <summary>
        /// Event argument type that carries a single integer value to event subscribers.
        /// </summary>
        public class ValueEventArgs : EventArgs
        {
            /// <summary>Gets or sets the integer value delivered with the event.</summary>
            public int Value { get; set; }
        }
        // Delegate type used by the event: a handler receives the sender and a ValueEventArgs.
        // NOTE(review): "Enent" in the name looks like a typo for "Event"; it is part of the public
        // name, so renaming it would break existing subscribers.
        public delegate void ValueChangedEnentHandler(object sender, ValueEventArgs e);
    
    
        /// <summary>
        /// Groups the lines of a large text file by the value of one separator-delimited column and
        /// writes the number of lines seen for each distinct value to a report file.
        /// </summary>
        /// <remarks>
        /// The input is streamed line by line, so only the distinct keys are held in memory.
        /// <see cref="ValueChanged"/> is raised on the thread that calls
        /// <see cref="StartAnalyseBigFile"/>; setting <see cref="StartFlag"/> to <c>false</c> from
        /// another thread stops the analysis at the next line.
        /// </remarks>
        public class BigFileTongJiJobs
        {
            /// <summary>Working-set ceiling in bytes (1 GB); the analysis ends early once it is exceeded.</summary>
            private const long MaxWorkingSetBytes = 1024L * 1024 * 1024;

            /// <summary>Group keys longer than this many characters are truncated before counting.</summary>
            private const int MaxKeyLength = 300;

            /// <summary>Progress is reported and memory is checked once every this many rows.</summary>
            private const int ProgressInterval = 10000;

            /// <summary>Size in bytes of the read buffer used for the input file.</summary>
            private const int ReadBufferSize = 64 * 1024;

            // May be written by another thread while the analysis loop polls it, hence volatile.
            private volatile bool _startFlag;

            /// <summary>
            /// Gets or sets whether the analysis may keep running. The loop in
            /// <see cref="StartAnalyseBigFile"/> stops as soon as it observes <c>false</c>.
            /// </summary>
            public bool StartFlag
            {
                get { return _startFlag; }
                set { _startFlag = value; }
            }

            /// <summary>
            /// Gets the exception behind the most recent -3 result of
            /// <see cref="StartAnalyseBigFile"/>, or <c>null</c> when the last run had no I/O failure.
            /// </summary>
            public Exception LastError { get; private set; }

            /// <summary>Raised to report how many rows have been processed so far.</summary>
            public event ValueChangedEnentHandler ValueChanged;

            /// <summary>Raises <see cref="ValueChanged"/> with the given arguments.</summary>
            /// <param name="e">Arguments carrying the current row count.</param>
            public void OnValueChange(ValueEventArgs e)
            {
                // Copy to a local first so a concurrent unsubscribe cannot null the delegate
                // between the check and the call.
                ValueChangedEnentHandler handler = ValueChanged;
                if (handler != null)
                {
                    handler(this, e);
                }
            }

            /// <summary>
            /// Reads <c>paramsInfo.inputPath</c>, counts the lines per value of the requested column,
            /// and writes the counts (highest first) to <c>paramsInfo.outputPath</c>, replacing any
            /// existing file at that path.
            /// </summary>
            /// <param name="paramsInfo">Input path, output path, separator and 1-based column number.</param>
            /// <returns>
            /// -1 when <c>columnNum</c> is smaller than 1; -2 when the input file does not exist;
            /// -3 when reading or writing fails (details in <see cref="LastError"/>); otherwise the
            /// number of rows processed, which is 0 for an empty file.
            /// </returns>
            public int StartAnalyseBigFile(TongjiParamsInfoStruct paramsInfo)
            {
                LastError = null;

                // A column number below 1 can never address a column.
                if (paramsInfo.columnNum < 1)
                {
                    return -1;
                }

                // Check the input before touching the output so a wrong path leaves no empty report behind.
                if (!File.Exists(paramsInfo.inputPath))
                {
                    return -2;
                }

                try
                {
                    return AnalyseAndWriteReport(paramsInfo);
                }
                catch (IOException ex)
                {
                    LastError = ex;
                    return -3;
                }
                catch (UnauthorizedAccessException ex)
                {
                    LastError = ex;
                    return -3;
                }
            }

            // Streams the input, builds the per-key counts and writes the report; returns the rows processed.
            private int AnalyseAndWriteReport(TongjiParamsInfoStruct paramsInfo)
            {
                Dictionary<string, int> counts = new Dictionary<string, int>();
                int rowCount = 0;
                bool memoryLimitHit = false;

                using (Process self = Process.GetCurrentProcess())
                using (StreamReader reader = new StreamReader(paramsInfo.inputPath, Encoding.UTF8, true, ReadBufferSize))
                using (StreamWriter writer = new StreamWriter(paramsInfo.outputPath, false, new UTF8Encoding(false)))
                {
                    // The report is a Windows text file; a bare CR is not shown as a line break by every tool.
                    writer.NewLine = "\r\n";

                    writer.WriteLine("分析开始,本进程使用内存大小:{0} KB,Date:{1}", GetWorkingSetBytes(self) / 1024, DateTime.Now);

                    // ReadLine accepts CR, LF and CRLF terminators and also returns a final line
                    // that has no terminator.
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        if (!StartFlag)
                        {
                            writer.WriteLine("强制停止分析");
                            break;
                        }

                        string key = ExtractColumn(line, paramsInfo.separator, paramsInfo.columnNum);
                        if (!string.IsNullOrEmpty(key))
                        {
                            if (key.Length > MaxKeyLength)
                            {
                                key = key.Substring(0, MaxKeyLength);
                            }

                            int seen;
                            counts.TryGetValue(key, out seen); // seen stays 0 for a new key
                            counts[key] = seen + 1;
                        }

                        // Notifying on every row would dominate the run time, so only the first
                        // 100 rows and every ProgressInterval-th row are reported.
                        if (rowCount % ProgressInterval == 0 || rowCount < 100)
                        {
                            if (GetWorkingSetBytes(self) > MaxWorkingSetBytes)
                            {
                                memoryLimitHit = true;
                                break;
                            }

                            OnValueChange(new ValueEventArgs() { Value = rowCount });
                        }

                        rowCount++;
                    }

                    if (memoryLimitHit)
                    {
                        // Record the early stop so the counts are not mistaken for a full result.
                        writer.WriteLine("内存占用超过上限,提前结束分析");
                    }

                    if (counts.Count > 0)
                    {
                        writer.WriteLine("分析结束:本进程使用内存大小:{0} KB,Date:{1},分组个数:{2}", GetWorkingSetBytes(self) / 1024, DateTime.Now, counts.Count);
                    }

                    writer.Write("本次处理的文本对象是");
                    writer.WriteLine(paramsInfo.inputPath);

                    foreach (KeyValuePair<string, int> item in counts.OrderByDescending(t => t.Value))
                    {
                        writer.WriteLine("统计次数Value:{0}\t Key: {1}", item.Value, item.Key);
                    }
                }

                // Final notification so listeners see the total row count.
                OnValueChange(new ValueEventArgs() { Value = rowCount });
                return rowCount;
            }

            // Returns the current working-set size of the process in bytes.
            private static long GetWorkingSetBytes(Process process)
            {
                // Process caches its property values; Refresh discards the cache.
                process.Refresh();
                return process.WorkingSet64;
            }

            // Returns the columnNum-th (1-based) field of the line, or "" when the line has fewer fields.
            private static string ExtractColumn(string line, string separator, int columnNum)
            {
                // Real tab characters are rewritten to the two-character text \t before splitting,
                // so that a separator entered as the text \t matches tab-delimited lines.
                string[] columns = line.Replace("\t", "\\t").Split(new string[] { separator }, StringSplitOptions.None);
                if (columns.Length < columnNum)
                {
                    return "";
                }

                return columns[columnNum - 1];
            }

        }
    
        // Parameter bundle passed to BigFileTongJiJobs.StartAnalyseBigFile.
        public struct TongjiParamsInfoStruct
        {
            // Path of the text file to analyse.
            public string inputPath { get; set; }
            // Path of the report file to write.
            public string outputPath { get; set; }
            // Separator string used to split each line into columns.
            public string separator { get; set; }
            // 1-based number of the column whose values are grouped and counted.
            public int columnNum { get; set; }
    
        }
    }
    

      调用代码:

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Diagnostics;
    using System.Drawing;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    
    namespace DaZhongLogTool
    {
        /// <summary>
        /// Form that collects the analysis parameters, runs <see cref="BigFileTongJiJobs"/> on a
        /// thread-pool thread, and shows progress and the final result.
        /// </summary>
        public partial class Form3 : Form
        {
            public Form3()
            {
                InitializeComponent();

                // Subscribe once here. Subscribing on every click stacks up handlers, so each
                // further run would update the status label several times per notification.
                tongjiJobs.ValueChanged += new ValueChangedEnentHandler(Line_ValueChange);
            }

            Color originalTongJiButtonColor;
            string originalTongJiButtonText;

            BigFileTongJiJobs tongjiJobs = new BigFileTongJiJobs();

            // True from the start of a run until its completion has been handled on the UI thread.
            // This is tracked separately from tongjiJobs.StartFlag because the stop button clears
            // StartFlag while the worker may still be writing its report.
            private bool isRunning;

            // Validates the inputs and starts the analysis on a thread-pool thread.
            private void btnTongJi_Click(object sender, EventArgs e)
            {
                if (isRunning)
                {
                    MessageBox.Show("正在处理中...,如需停止,请单击停止");
                    return;
                }

                if (string.IsNullOrEmpty(txtSeparator.Text))
                {
                    MessageBox.Show("请输入分隔符");
                    return;
                }
                if (numericUpDown1.Value < 1)
                {
                    MessageBox.Show("请输入按照分隔符分割的待统计内容的对应的列数,从1开始");
                    return;
                }

                string inputPath = txtInputPath.Text.Trim();
                if (string.IsNullOrEmpty(inputPath))
                {
                    MessageBox.Show("请输入等待统计的文本路径");
                    return;
                }
                if (!File.Exists(inputPath))
                {
                    MessageBox.Show("待统计的文本文件不存在,请重新输入");
                    return;
                }

                // The report goes next to the input file. Path.Combine inserts the directory
                // separator; plain concatenation would glue the folder name onto the file name.
                string reportName = DateTime.Now.ToString("yyyyMMdd_HHmm") + "_result.log";

                TongjiParamsInfoStruct paramsInfo = new TongjiParamsInfoStruct();
                paramsInfo.inputPath = inputPath;
                paramsInfo.outputPath = Path.Combine(Path.GetDirectoryName(inputPath), reportName);
                paramsInfo.separator = txtSeparator.Text;
                paramsInfo.columnNum = (int)numericUpDown1.Value;

                originalTongJiButtonColor = this.btnTongJi.BackColor;
                originalTongJiButtonText = this.btnTongJi.Text;

                isRunning = true;
                tongjiJobs.StartFlag = true;

                Func<TongjiParamsInfoStruct, int> hander = new Func<TongjiParamsInfoStruct, int>(tongjiJobs.StartAnalyseBigFile);
                hander.BeginInvoke(paramsInfo, new AsyncCallback(AsyncCallback1), hander);
            }

            // Completion callback of the asynchronous call; runs on a thread-pool thread.
            private void AsyncCallback1(IAsyncResult ar)
            {
                Func<TongjiParamsInfoStruct, int> handler = (Func<TongjiParamsInfoStruct, int>)ar.AsyncState;

                int result = -3;
                Exception failure = null;
                try
                {
                    result = handler.EndInvoke(ar);
                }
                catch (Exception ex)
                {
                    // EndInvoke rethrows whatever the worker threw. Left unhandled on a thread-pool
                    // thread it would end the process, so it is reported to the user instead.
                    failure = ex;
                }
                finally
                {
                    tongjiJobs.StartFlag = false;
                }

                RunOnUiThread(delegate { ShowResult(result, failure); });
            }

            // Updates the status label and tells the user how the run ended; must run on the UI thread.
            private void ShowResult(int result, Exception failure)
            {
                // Clear the flag before the message box so the form can accept a new run
                // as soon as the box is dismissed.
                isRunning = false;
                toolStripStatusLabel1.Text = "上次任务处理完毕,等待下次开始。" + DateTime.Now.ToString();

                if (result > 0)
                {
                    MessageBox.Show("本次成功处理了" + result + "行数据", "成功");
                }
                else if (result == -2)
                {
                    MessageBox.Show("文件不存在,请重新选择");
                }
                else if (result < 0)
                {
                    string reason = failure != null ? failure.Message : "错误码:" + result;
                    MessageBox.Show("处理失败," + reason);
                }
            }

            // Progress handler; the event is raised on the worker thread, so the update is marshalled.
            private void Line_ValueChange(object sender, ValueEventArgs e)
            {
                int rows = e.Value;
                RunOnUiThread(delegate
                {
                    toolStripStatusLabel1.Text = string.Format("统计中……,已处理了{0}行日志,时间:{1}", rows, DateTime.Now);
                });
            }

            // Runs the action on the thread that owns this form; does nothing once the form is gone.
            private void RunOnUiThread(MethodInvoker action)
            {
                if (IsDisposed || !IsHandleCreated)
                {
                    return;
                }

                try
                {
                    if (InvokeRequired)
                    {
                        BeginInvoke(action);
                    }
                    else
                    {
                        action();
                    }
                }
                catch (ObjectDisposedException)
                {
                    // The form was closed between the check and the call; nothing left to update.
                }
                catch (InvalidOperationException)
                {
                    // The window handle was destroyed in the meantime; nothing left to update.
                }
            }

            // Lets the user pick the input file and copies its path into the text box.
            private void btnSelectFile_Click(object sender, EventArgs e)
            {
                using (OpenFileDialog fileDialog = new OpenFileDialog())
                {
                    fileDialog.Multiselect = false;
                    fileDialog.Filter = "(*.*)|*.*";
                    fileDialog.RestoreDirectory = false;

                    if (fileDialog.ShowDialog() == DialogResult.OK)
                    {
                        txtInputPath.Text = fileDialog.FileName;
                    }
                }
            }

            // Stop button: asks the running analysis to stop at its next line.
            private void btnStopTongJi_Click(object sender, EventArgs e)
            {
                tongjiJobs.StartFlag = false;
            }

        }
    
    
    
    
       
    }
    

      该工具,是来自于实际工作的需求,用于根据某一列统计次数。简单实用。

     源代码下载:【源码】大文件分组统计简单工具           【EXE】大文件分组统计简单工具

    需要的小伙伴尽管拿走,不要忘记推荐一下,谢谢

  • 相关阅读:
    Go语言从入门到放弃(结构体常见的tag)
    Go语言从入门到放弃(设置 go get 为国内源)
    AndroidStuidio安装
    ADB常用命令
    win10安装Nodejs
    VsCode配置Go语言插件
    Visual Studio Code使用指南
    Go语言从入门到放弃(四)
    CentOs7.5安装Redis
    InnoDB INFORMATION_SCHEMA FULLTEXT Index Tables
  • 原文地址:https://www.cnblogs.com/wangqiideal/p/9641984.html
Copyright © 2011-2022 走看看