zoukankan      html  css  js  c++  java
  • pandas处理行数据(apply的应用)

    # In[1]
    import os
    
    # Switch the working directory to the dataset root so that the relative
    # './raw_data/...' and './swtc/...' paths used by the later cells resolve
    # against it.
    path = '/home/zjdou/jupyter/root/Smart-Writing/TextClassification/DATA'
    os.chdir(path)
    # Echo the directory that is now in effect.
    print(os.getcwd())
    
    # In[2]
    import pandas as pd
    import numpy as np
    
    # Load the two raw JSON dumps and stack them into one frame.
    file01 = pd.read_json('./raw_data/gov01.json')
    file01
    
    file02 = pd.read_json('./raw_data/gov02.json')
    file02
    
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with ignore_index=True yields the same rows (file01 first,
    # then file02) under a fresh 0..n-1 index.
    total = pd.concat([file01, file02], ignore_index=True)
    
    # In[1]
    total
    
    # In[2]
    # Take the first entry of each row's 'topics' as its label. A row whose
    # 'topics' is empty has no first entry (plain x[0] would raise
    # IndexError), so it is mapped to '' and handled like any other
    # unlabeled row.
    first_topic = total['topics'].apply(lambda x: x[0] if len(x) > 0 else '')
    no_label_idx = total[first_topic == ''].index
    no_label_idx
    
    # Remove the unlabeled rows and renumber the remaining ones from 0.
    total.drop(no_label_idx, inplace=True)
    total.reset_index(drop=True, inplace=True)
    
    # In[2]
    # Every remaining row has a non-empty first entry, so x[0] is safe here.
    # Replace the 'topics' value by that first entry and persist the result.
    total['topics'] = total['topics'].apply(lambda x: x[0])
    total
    total.to_json('./swtc/total.json')
    
    # In[4]
    # Normalise the label separator: reload the saved frame, turn every '/'
    # inside a topic into '-', and write the frame back to the same file.
    total = pd.read_json('./swtc/total.json')
    
    def _slash_to_dash(label):
        """Return label with each '/' replaced by '-'."""
        return label.replace('/', '-')
    
    total['topics'] = total['topics'].map(_slash_to_dash)
    # (A disabled sanity check listing rows whose topic still contains '/'
    # used to sit before and after the replacement.)
    total.to_json('./swtc/total.json')
    total
    
    # In[1]
    # Topics without any '-' get the suffix '-其他' appended; topics that
    # already contain a '-' are kept unchanged. The frame is then saved again.
    total['topics'] = total['topics'].apply(lambda x: x + '-其他' if '-' not in x else x)
    total.to_json('./swtc/total.json')
    
    # In[2]
    # Data augmentation: cut every row's 'content' into halves, twice over.
    total = pd.read_json('./swtc/smartwrite2_train.json')
    total
    # In[3]
    
    def _halve_content(frame):
        """Return (front, back) frames with the two halves of each 'content'.
    
        'title' and 'topics' are carried over unchanged. The cut point is
        len(content) // 2, so for an odd length the back half is one element
        longer than the front half.
        """
        front = pd.DataFrame({'title': frame.title,
                              'content': frame['content'].apply(lambda x: x[:len(x) // 2]),
                              'topics': frame.topics})
        back = pd.DataFrame({'title': frame.title,
                             'content': frame['content'].apply(lambda x: x[len(x) // 2:]),
                             'topics': frame.topics})
        return front, back
    
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order (all front halves, then all back halves) and the same
    # renumbered index.
    front_total, beh_total = _halve_content(total)
    total = pd.concat([front_total, beh_total], ignore_index=True)
    
    # Split once more, so each input row ends up as four pieces.
    front_total, beh_total = _halve_content(total)
    total = pd.concat([front_total, beh_total], ignore_index=True)
    # NOTE: the result is not written to disk here; the export below is
    # left disabled, as in the original cell.
    # print(len(total.content[0]))
    # total.to_json('./swtc/smartwrite4_train.json')
    
    # In[2]
    import re
    # total = pd.read_json('./swtc/smartwrite3_train.json')
    total = pd.read_json('./swtc/new_total.json')
    
    def _names_in_title_marks(text):
        """Join the distinct strings enclosed in 《…》 in text with ',', longest first."""
        # dict.fromkeys drops duplicates while keeping first-occurrence order.
        # A set would leave equal-length names in hash order, which changes
        # between interpreter runs and made the saved column irreproducible.
        distinct = dict.fromkeys(re.findall(r'《(.*?)》', text))
        # sorted() is stable (also with reverse=True), so names of equal
        # length stay in the order they first appear in the text.
        return ','.join(sorted(distinct, key=len, reverse=True))
    
    # New column 'extra': the comma-joined names found in each row's 'content'.
    total['extra'] = total['content'].apply(_names_in_title_marks).to_list()
    # In[3]
    total.to_json('total_and_extra.json')
    
    # In[4]
    # Shuffle all rows, then cut the frame into train / dev / test at
    # roughly 80% / 10% / 10%.
    shuffle_total = total.sample(frac=1).reset_index(drop=True)
    shuffle_total
    # In[1]
    total_len = len(shuffle_total)
    # Multiply before the floor division. `total_len // 10 * 8` rounds down
    # to a multiple of ten first, which shifts up to seven rows from train
    # into test (e.g. 19 rows gave 8/1/10 instead of 15/1/3).
    train_len = total_len * 8 // 10
    dev_len = total_len // 10
    # Whatever is left after train and dev goes to test.
    test_len = total_len - train_len - dev_len
    print(train_len, dev_len, test_len)
    
    train = shuffle_total.iloc[:train_len]
    train.to_json('./swtc/smartwrite5_train.json')
    
    dev = shuffle_total.iloc[train_len:train_len + dev_len]
    dev.to_json('./swtc/smartwrite5_dev.json')
    
    # test_len covers exactly the remaining rows, so an open-ended slice
    # selects the same range.
    test = shuffle_total.iloc[train_len + dev_len:]
    test.to_json('./swtc/smartwrite5_test.json')
    
    # %%
    # train.to_json('./swtc/')
    # pd.read_json('./swtc/smartwrite_dev.json')
    # a = pd.read_json('./raw_data/国务院部门文件_国务院政策文件库_中国政府网.json')
    # b = pd.read_json('./raw_data/国务院文件_国务院政策文件库_中国政府网.json')
    # len(a) + len(b)
    # %%
    # Read the three saved splits back from disk in one go.
    train, dev, test = (
        pd.read_json('./swtc/smartwrite5_train.json'),
        pd.read_json('./swtc/smartwrite5_dev.json'),
        pd.read_json('./swtc/smartwrite5_test.json'),
    )
    test
    
    # In[1]
    # Build a topic -> integer id lookup from the saved, cleaned frame.
    total = pd.read_json('./swtc/total.json')
    topics = total['topics'].to_list()
    print(topics)
    # Sort the distinct topics before numbering them. Iteration order of a
    # set of strings depends on string hashing, which is randomised per
    # interpreter run, so list(set(...)) gave the same topic a different id
    # on every run.
    dup_topics = sorted(set(topics))
    print(dup_topics, len(dup_topics))
    # %%
    # Each topic's id is its position in the sorted list.
    topics_dic = {topic: i for i, topic in enumerate(dup_topics)}
    
    print(topics_dic)
    
  • 相关阅读:
    2016/3/16 高级查询 ①连接查询 ②联合查询 ③子查询 无关 相关
    2016/3/13 七种查询 (普通查询 条件查询 排序查询 模糊查询 统计查询 分组查询 分页查询 )
    2016/3/13 MySQL 增删查改 CRUD 用代码实现
    2016/3/10 数据库简单操作( 创建数据库 创建表 数值类型 主键 外键 自动递增 )
    2016/3/10 PHP环境搭建 LAMP WAMP
    2016/3/10 PHP (超文本预处理器) 是什么?
    2016/3/1 淘宝 腾讯 网易 css初始化代码 以及最基础的初始化
    判断i在字符串中出现的次数(2016.1.12P141-1)
    2016-1-9作业——输出二维数组的和
    2016-1-8作业
  • 原文地址:https://www.cnblogs.com/douzujun/p/15221052.html
Copyright © 2011-2022 走看看