zoukankan      html  css  js  c++  java
  • pandas(二):在pandas中搜索包含关键词的行

    一、代码

    # -*- coding: UTF-8 -*-
    import json
    import pandas as pd
    
    """获得所有的文本"""
    def get_all_text():
        """Flatten the primary JSON-lines file into a tab-separated CSV.

        Reads ``../datas/format/primary.json``, one JSON object per line.
        Each object must provide ``file_name`` and a ``datas`` mapping; every
        item of ``datas`` becomes one output row with the columns ``names``
        (the file name), ``roles`` (the item key) and ``texts`` (the item
        value). The rows are written to ``../datas/format/all_text.csv``.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks ``file_name`` or ``datas``.
        """
        file_path = "../datas/format/primary.json"
        file_out = "../datas/format/all_text.csv"
        names = []
        roles = []
        texts = []
        with open(file_path, "r", encoding="utf8") as f:
            # Stream the file line by line instead of loading it whole.
            for data_line in f:
                # A blank line (e.g. a trailing newline) is not a record.
                if not data_line.strip():
                    continue
                json_data = json.loads(data_line)
                file_name = json_data["file_name"]
                for role, text in json_data["datas"].items():
                    names.append(file_name)
                    roles.append(role)
                    texts.append(text)
        dataframe = pd.DataFrame({"names": names, "roles": roles, "texts": texts})
        # The tab is spelled as an escape so that an editor cannot silently
        # turn it into spaces.
        dataframe.to_csv(file_out, index=False, sep="\t")
    
    """从csv搜索数据"""
    def search_text(key):
        """Save every row of the all-text CSV whose text matches ``key``.

        Reads the tab-separated ``../datas/format/all_text.csv`` and writes
        the rows whose ``texts`` column contains ``key`` to
        ``../datas/classes/<key>.csv`` (also tab-separated).

        Args:
            key: Search pattern. It is passed to ``Series.str.contains``
                with its default settings, so it is interpreted as a
                regular expression. It is also used verbatim as the output
                file name.
        """
        file_path = "../datas/format/all_text.csv"
        file_out = "../datas/classes/" + key + ".csv"
        data = pd.read_csv(file_path, sep="\t")
        # Empty cells are read back as NaN. na=False counts them as
        # non-matching, so the boolean mask never contains NaN (which would
        # make the row selection below raise ValueError).
        mask = data["texts"].str.contains(key, na=False)
        data[mask].to_csv(file_out, index=False, sep="\t")
    
    """提取带有婚字的数据"""
    def data_annotate():
        """Collect the records that mention "婚" into a file awaiting labels.

        Reads ``../datas/format/primary.json`` one JSON object per line. A
        record is kept when at least one value of its ``datas`` mapping
        contains the keyword. Each kept record is written as one JSON line to
        ``../datas/annotate/label.json`` with the keys ``name`` (the record's
        ``file_name``), ``label`` (an empty string) and ``datas``.

        Returns:
            The string ``"success"``.
        """
        file_in = "../datas/format/primary.json"
        file_out = "../datas/annotate/label.json"
        # An empty keyword is a substring of every string and would make the
        # filter keep all records, so the character is spelled out here.
        keyword = "婚"
        with open(file_out, "w", encoding="utf8") as fo, \
                open(file_in, "r", encoding="utf8") as f:
            for line in f:
                # A blank line (e.g. a trailing newline) is not a record.
                if not line.strip():
                    continue
                json_data = json.loads(line)
                if not any(keyword in v for v in json_data["datas"].values()):
                    continue
                item = {
                    "name": json_data["file_name"],
                    "label": "",
                    "datas": json_data["datas"],
                }
                fo.write(json.dumps(item, ensure_ascii=False) + "\n")
        return "success"
    
    """提取标注过的数据"""
    def annotate():
        """Split the annotation file into labelled and unlabelled records.

        Reads ``../datas/annotate/label.json`` one JSON object per line.
        Records whose ``label`` value is truthy are written to
        ``../datas/annotate/labeled.json``; all others go to
        ``../datas/annotate/unlabel.json``. Each record is re-serialised as
        one JSON line with non-ASCII characters kept as-is.

        Returns:
            The string ``"success"``.
        """
        file_in = "../datas/annotate/label.json"
        file_labeled = "../datas/annotate/labeled.json"
        file_unlabeled = "../datas/annotate/unlabel.json"
        with open(file_in, "r", encoding="utf8") as f_in, \
                open(file_labeled, "w", encoding="utf8") as f_labeled, \
                open(file_unlabeled, "w", encoding="utf8") as f_unlabeled:
            for line in f_in:
                # A blank line (e.g. a trailing newline) is not a record.
                if not line.strip():
                    continue
                json_data = json.loads(line)
                target = f_labeled if json_data["label"] else f_unlabeled
                target.write(json.dumps(json_data, ensure_ascii=False) + "\n")
        return "success"
    
    def label_to_csv():
        """Convert the labelled JSON-lines file into a tab-separated CSV.

        Reads ``../datas/annotate/labeled.json`` one JSON object per line and
        writes ``../datas/annotate/labeled.csv`` with three columns:

        * ``labels``: the record's ``label`` value;
        * ``datas``: the values of the record's ``datas`` mapping joined
          with ``|`` (the values must therefore be strings);
        * ``data_dict``: the raw input line with newline characters removed.
        """
        file_path = "../datas/annotate/labeled.json"
        file_out = "../datas/annotate/labeled.csv"
        labels = []
        datas = []
        raw_records = []
        with open(file_path, "r", encoding="utf8") as f:
            for data_line in f:
                # A blank line (e.g. a trailing newline) is not a record.
                if not data_line.strip():
                    continue
                json_data = json.loads(data_line)
                labels.append(json_data["label"])
                datas.append("|".join(json_data["datas"].values()))
                # Keep the original JSON text so the record can be restored
                # from the CSV; only the line terminator is dropped.
                raw_records.append(data_line.replace("\n", ""))
        dataframe = pd.DataFrame(
            {"labels": labels, "datas": datas, "data_dict": raw_records}
        )
        dataframe.to_csv(file_out, index=False, sep="\t")
    
    """提取带工作的数据"""
    def get_work():
        """Run the keyword search for "工作" via ``search_text``."""
        keyword = "工作"
        search_text(keyword)
    
    # Script entry point: only the labelled-JSON to CSV conversion is run.
    if __name__ == "__main__":
        label_to_csv()
  • 相关阅读:
    【算法总结】多项式相关
    【算法总结】积性函数相关
    【算法总结】概率与期望相关
    【算法总结】博弈论相关
    【算法总结】线性代数相关
    【算法总结】根号算法相关
    【算法总结】计算几何相关
    【算法总结】组合数学相关
    【算法总结】字符串相关
    【算法总结】数论相关
  • 原文地址:https://www.cnblogs.com/zhangxianrong/p/14769616.html
Copyright © 2011-2022 走看看