zoukankan      html  css  js  c++  java
  • python处理json文件(Yelp数据集)

    使用 Python 脚本处理 Yelp 数据集

    import sys
    import json
    import re
    import os
    import time
    
    # Number of (train, test) reviews to emit for each named dataset size.
    DATASET_SIZES = {
        'sample': (40, 10),
        'small': (1280, 320),
        'medium': (32000, 8000),
        'large': (2000000, 400000),
    }

    # Matches every character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    # Width, in characters, of the textual progress bar.
    _BAR_LEN = 30


    def _bucket_stars(num_stars):
        """Collapse a star rating into one of three classes: 5, 3 or 1."""
        if num_stars > 4.0:
            return 5
        if num_stars > 2.0:
            return 3
        return 1


    def _clean_text(text):
        """Strip punctuation and line breaks from *text* and lowercase it."""
        stripped = _PUNCTUATION.sub('', text)
        # Line breaks are removed so that each review stays on a single
        # output line.
        # NOTE(review): the second removed character is assumed to be a
        # carriage return -- confirm against the upstream script.
        return stripped.replace("\n", "").replace("\r", "").lower()


    def _read_review_lines(path, limit):
        """Return at most *limit* non-blank lines from the file at *path*."""
        lines = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if len(lines) >= limit:
                    break
                line = line.strip()
                if line:
                    lines.append(line)
        return lines


    def _show_progress(count, total):
        """Redraw the single-line progress bar on stdout."""
        filled_len = int(round(_BAR_LEN * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
        bar = '=' * filled_len + '-' * (_BAR_LEN - filled_len)
        # The leading carriage return rewinds the cursor so the bar is
        # overwritten in place instead of scrolling.
        sys.stdout.write("\r[%s] %s%s ... %d/%d reviews created"
                         % (bar, percents, '%', count, total))
        sys.stdout.flush()


    def main(argv):
        """Build train/test text files from the head of the Yelp review dump.

        Reads the first ``num_train + num_test`` reviews from
        ``yelp_dataset/review.json`` (one JSON object per line) and writes
        ``datasets/<name>/train.txt`` and ``datasets/<name>/test.txt``, both
        relative to the current working directory. Each output line has the
        form ``<review_id> <stars> <text>``, where stars is bucketed to 5, 3
        or 1 and text is lowercased with punctuation and line breaks removed.

        Args:
            argv: Command-line arguments; ``argv[1]`` must be one of the keys
                of ``DATASET_SIZES``.

        Raises:
            SystemExit: If the dataset name is missing or unknown.
        """
        if len(argv) < 2 or argv[1] not in DATASET_SIZES:
            program = argv[0] if argv else 'script'
            sys.exit('usage: %s {%s}' % (program, '|'.join(DATASET_SIZES)))

        dataset_name = argv[1]
        num_train, num_test = DATASET_SIZES[dataset_name]

        # Only the raw lines are kept in memory; each one is parsed right
        # before it is written out.
        lines = _read_review_lines(
            os.path.join('yelp_dataset', 'review.json'), num_train + num_test)

        print("Done loading in dataset")

        out_dir = os.path.join('datasets', dataset_name)
        os.makedirs(out_dir, exist_ok=True)

        total = len(lines)
        train_path = os.path.join(out_dir, 'train.txt')
        test_path = os.path.join(out_dir, 'test.txt')
        with open(train_path, 'w', encoding='utf-8') as train, \
                open(test_path, 'w', encoding='utf-8') as test:
            for count, line in enumerate(lines, 1):
                review = json.loads(line)
                record = '%s %s %s\n' % (
                    review['review_id'],
                    _bucket_stars(review['stars']),
                    _clean_text(review['text']),
                )
                # The first num_train reviews go to the training split and
                # the remainder (at most num_test) to the test split.
                target = train if count <= num_train else test
                target.write(record)
                _show_progress(count, total)

        # Finish the progress line.
        sys.stdout.write("\n")


    if __name__ == '__main__':
        main(sys.argv)
    
    
  • 相关阅读:
    crm 4 注释与上传附件权限
    动态图片轮播
    PHP 连接 MSSQL
    php mssql 中文各种乱码
    百度地图逆地址解析
    Microsoft Visual C++ 2015 Redistributable(x64)
    服务器 vps 空间
    Python之路【第二篇】:Python基础(二)
    Python之路【第一篇】:Python简介和入门
    2016年会成为Java EE微服务年吗?
  • 原文地址:https://www.cnblogs.com/Willendless/p/12066555.html
Copyright © 2011-2022 走看看