zoukankan      html  css  js  c++  java
  • 数学建模之Python操作csv文件

    1.用Python读取一个csv文件的某一列作为字典的键,然后统计这些键在另一个csv文件的指定列中出现的次数。

    import pandas as pd
    import numpy as np
    import csv
    import codecs
    import sys
    
    # Load both CSV files into pandas DataFrames as soon as the script starts.
    # NOTE(review): neither `data_original` nor `data` is referenced by the code
    # visible below (which re-reads the same paths through the csv module) --
    # confirm nothing else needs them before treating these loads as removable.
    data_original = pd.read_csv('D:/csv_data_original.csv')
    data = pd.read_csv('D:/week1.csv')
    #data = data['retweeted_status_mid'].fillna('NOT PROVIDED',inplace=True)
    #data_transpond = data[data['retweeted_status_mid'] != 'NOT PROVIDED']
    
    # Count how many times each original post was forwarded.
    def statistics(path1,path2):
        """Count, for every key in ``path2``, the rows of ``path1`` that cite it.

        Keys come from the first column of ``path2`` and are registered in the
        module-level ``mid`` dict with a count of 0.  Each row of ``path1``
        whose second column equals a registered key then adds 1 to that key.

        Both files are read as ISO-8859-1 and every key field is re-decoded as
        GBK (undecodable bytes are dropped).  The decoded text is used both
        for storing and for looking up keys, so non-ASCII keys match.

        Args:
            path1: CSV file with a header row; column index 1 is matched
                against the keys.
            path2: CSV file with a header row; column index 0 supplies the
                keys.

        Returns:
            None.  Results are accumulated in the global ``mid``.
        """
        def decode_key(raw):
            # The file was read byte-for-byte as ISO-8859-1; round-trip the
            # field to recover the GBK text those bytes actually encode.
            return raw.encode("iso-8859-1").decode("gbk", "ignore")

        num2 = 0
        # Pass 1: register every distinct key with an initial count of 0.
        with open(path2, 'r', encoding="iso-8859-1", newline='') as f:
            reader2 = csv.reader(f)
            data_head2 = next(reader2, None)  # tolerate a completely empty file
            if data_head2 is not None:
                print(data_head2)
            for data_line in reader2:
                if not data_line:
                    # A blank line must not end processing; just skip it.
                    continue
                key = decode_key(data_line[0])
                if key not in mid:
                    mid[key] = 0
                    num2 += 1
                    print("正在创建第" + str(num2) + "个键")
            print("数据处理完毕,键值完全形成" + str(num2) + "!")
        # Pass 2: count how often each registered key occurs.
        with open(path1, 'r', encoding="iso-8859-1", newline='') as f:
            reader1 = csv.reader(f)
            data_head1 = next(reader1, None)
            if data_head1 is not None:
                print(data_head1)
            for data_line in reader1:
                if len(data_line) < 2:
                    # Blank or truncated row: there is no key column to match.
                    continue
                key = decode_key(data_line[1])
                if key in mid:
                    mid[key] += 1
                    print("这条微博被转发" + str(mid[key]) + "次")
            print("数据处理完毕,转发次数统计完毕")
    # Convert the dictionary into two parallel lists.
    def transpond(dict):
        """Publish the keys and values of ``dict`` as module-level lists.

        Rebinds the globals ``list_key`` (the keys) and ``list_value`` (the
        values); both follow the mapping's iteration order.
        """
        global list_key, list_value
        list_key = [*dict]  # iterating a mapping yields its keys
        list_value = [*dict.values()]
    
    
    # Write the data to a CSV file.
    def data_write_csv(file_name, list1,list2):
        """Write ``list1`` and ``list2`` side by side as a two-column CSV.

        ``file_name`` is the output path and is opened in write mode.  Rows
        are built by pairing the elements of both lists positionally with
        ``zip``, so pairing stops at the end of the shorter list.
        """
        with open(file_name, mode='w', newline='') as out:
            csv_out = csv.writer(out)
            for pair in zip(list1, list2):
                csv_out.writerow(pair)
    
    
    if __name__ == "__main__":
        # Shared state: statistics() fills ``mid`` and transpond() rebinds the lists.
        mid = {}  # forward count of each original post
        list_key, list_value = [], []  # keys / values of ``mid`` as parallel lists

        path_data = 'D:/week1.csv'  # raw data
        path_data_original = 'D:/csv_data_original.csv'  # processed data holding only original posts
        path_save = 'D:/transpond_data.csv'  # where the processed result is saved

        # Count, flatten the dict into two lists, then write them out.
        statistics(path_data, path_data_original)
        transpond(mid)
        data_write_csv(path_save, list_key, list_value)
    

    2.与1类似的操作:这里不是统计键出现的次数,而是按键累加另一列中的数值,其余细节变动见代码中的注释

    import csv
    import pandas as pd
    
    # Sum the forward counts of the posts belonging to each key.
    def statistics(path1,path2):
        """Sum, for every key in ``path2``, a numeric column of ``path1``.

        Keys come from the first column of ``path2`` and are registered in the
        module-level ``mid`` dict with a total of 0.  For each row of
        ``path1`` whose third column equals a registered key, the integer in
        the row's second column is added to that key's total.

        Both files are read as ISO-8859-1 and every key field is re-decoded as
        GBK (undecodable bytes are dropped).  The decoded text is used both
        for storing and for looking up keys, so non-ASCII keys match.

        Args:
            path1: CSV file with a header row; column index 2 is matched
                against the keys and column index 1 holds the number to add.
            path2: CSV file with a header row; column index 0 supplies the
                keys.

        Returns:
            None.  Results are accumulated in the global ``mid``.

        Raises:
            ValueError: if the second column of a matching row is not an
                integer literal.
        """
        def decode_key(raw):
            # The file was read byte-for-byte as ISO-8859-1; round-trip the
            # field to recover the GBK text those bytes actually encode.
            return raw.encode("iso-8859-1").decode("gbk", "ignore")

        num2 = 0
        # Pass 1: register every distinct key with an initial total of 0.
        with open(path2, 'r', encoding="iso-8859-1", newline='') as f:
            reader2 = csv.reader(f)
            data_head2 = next(reader2, None)  # tolerate a completely empty file
            if data_head2 is not None:
                print(data_head2)
            for data_line in reader2:
                if not data_line:
                    # A blank line must not end processing; just skip it.
                    continue
                key = decode_key(data_line[0])
                if key not in mid:
                    mid[key] = 0
                    num2 += 1
                    print("正在创建第" + str(num2) + "个键")
            print("数据处理完毕,键值完全形成" + str(num2) + "!")
        # Pass 2: add up the numeric column for each registered key.
        with open(path1, 'r', encoding="iso-8859-1", newline='') as f:
            reader1 = csv.reader(f)
            data_head1 = next(reader1, None)
            if data_head1 is not None:
                print(data_head1)
            for data_line in reader1:
                if len(data_line) < 3:
                    # Blank or truncated row: there is no key column to match.
                    continue
                key = decode_key(data_line[2])
                if key in mid:
                    mid[key] += int(data_line[1])
                    print("这个用户的微博被转发一共" + str(mid[key]) + "次")
            print("数据处理完毕,转发次数统计完毕")
    
    # Split the dictionary into a key list and a value list.
    def transpond(dict):
        """Store the contents of ``dict`` in two module-level lists.

        After the call the global ``list_key`` holds the keys and the global
        ``list_value`` holds the values, each in the mapping's iteration order.
        """
        global list_key
        global list_value
        list_key = [*dict]  # unpacking a mapping iterates over its keys
        list_value = [*dict.values()]
    
    # Save two parallel lists as a CSV file.
    def data_write_csv(file_name, list1,list2):
        """Write the zipped pairs of ``list1`` and ``list2`` to ``file_name``.

        The file is opened in write mode with ``newline=''`` so the csv writer
        controls the line endings.  Each ``(list1[i], list2[i])`` pair becomes
        one row; ``zip`` stops at the end of the shorter list.
        """
        with open(file_name, mode='w', newline='') as target:
            paired_rows = zip(list1, list2)
            csv.writer(target).writerows(paired_rows)
    
    if __name__ == '__main__':
        # Shared state: statistics() fills ``mid`` and transpond() rebinds the lists.
        mid = {}
        list_key, list_value = [], []

        path1 = 'D:/csv_data_original_num.csv'  # data used to build the keys
        path2 = 'D:/data_all.csv'  # data searched for those keys
        path_save = 'D:/user_transpond.csv'  # where the finished statistics are stored

        # statistics() expects the searched file first and the key file second.
        statistics(path2, path1)
        transpond(mid)
        data_write_csv(path_save, list_key, list_value)
    

    3.将数据量很大的csv文件按行号分段,把其中一列的内容分别写入几个较小的txt文件

    # -*- coding: utf-8 -*-
    import pandas as pd
    import csv
    
    
    def get_txt(path1,path2,path3,path4,path5,path6,path7,path8,chunk_size=700000):
        """Split one column of a CSV file into seven text files by row number.

        Reads ``path1`` as UTF-8 CSV, skips its header row and writes the
        value at column index 6 of every data row, followed by a newline, to
        one of the seven output files.  Data rows are numbered from 1 and row
        ``num`` goes to output ``num // chunk_size``, capped at the last
        output.  With the default ``chunk_size`` the first file therefore gets
        rows 1..699999, the second rows 700000..1399999, and so on.  Rows past
        the seventh range are appended to the last file instead of being
        discarded.

        Blank rows and rows with fewer than seven columns are skipped and not
        numbered.  Every file is closed on return, including on error.

        Args:
            path1: input CSV path.
            path2, path3, path4, path5, path6, path7, path8: the seven output
                text file paths, in range order.
            chunk_size: number of row numbers covered by each output range.

        Raises:
            ValueError: if ``chunk_size`` is smaller than 1.
        """
        from contextlib import ExitStack

        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        out_paths = (path2, path3, path4, path5, path6, path7, path8)
        num = 0
        # ExitStack closes the input and all seven outputs on any exit path.
        with ExitStack() as stack:
            f = stack.enter_context(open(path1, 'r', encoding='utf-8', newline=''))
            outputs = [
                stack.enter_context(open(out_path, "w", encoding='utf-8'))
                for out_path in out_paths
            ]
            last = len(outputs) - 1
            reader1 = csv.reader(f)
            data_head1 = next(reader1, None)  # tolerate a completely empty file
            if data_head1 is not None:
                print(data_head1)
            for data_line in reader1:
                if len(data_line) < 7:
                    # Blank or truncated row: there is no column 6 to export.
                    continue
                num += 1
                print(num)
                print(data_line[6])
                # Rows beyond the last range land in the last file.
                outputs[min(num // chunk_size, last)].write(data_line[6] + '\n')
            print("数据处理完毕,转发次数统计完毕")
    if __name__ == '__main__':
        # Input CSV whose rows are split across the text files below.
        path1 = 'D:/week1.csv'
        # The seven output text files, in the order get_txt() fills them.
        (path2, path3, path4, path5, path6, path7, path8) = (
            'D:/text1.txt',
            'D:/text2.txt',
            'D:/text3.txt',
            'D:/text4.txt',
            'D:/text5.txt',
            'D:/text6.txt',
            'D:/text7.txt',
        )
        get_txt(path1, path2, path3, path4, path5, path6, path7, path8)
    
    作者:睿晞
    身处这个阶段的时候,一定要好好珍惜,这是我们唯一能做的,求学,钻研,为人,处事,交友……无一不是如此。
    劝君莫惜金缕衣,劝君惜取少年时。花开堪折直须折,莫待无花空折枝。
    曾有一个业界大牛说过这样一段话,送给大家:   “华人在计算机视觉领域的研究水平越来越高,这是非常振奋人心的事。我们中国错过了工业革命,错过了电气革命,信息革命也只是跟随状态。但人工智能的革命,我们跟世界上的领先国家是并肩往前跑的。能身处这个时代浪潮之中,做一番伟大的事业,经常激动的夜不能寐。”
    本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文链接,否则保留追究法律责任的权利.
  • 相关阅读:
    注册表
    windows.location.href在IE6下停止工作
    LINUX配置IP的三种方式
    InnoSetup 打包代码 检测.netFramework
    SQLiteHelper
    黑马程序员_看视频记笔记_C#编程基础02
    通过注册表来检测是否安装Office
    SQLiteHelper
    TSQL
    IIS下发布关于Excel导入导出时遇到的问题集锦
  • 原文地址:https://www.cnblogs.com/tsruixi/p/11406848.html
Copyright © 2011-2022 走看看