zoukankan      html  css  js  c++  java
  • 大数据加工平台py文件内容

    #coding:utf-8
    # coding:utf-8
    import matplotlib as matplotlib
    import numpy as np
    import pandas as pd


    import openpyxl

    from pandas import DataFrame, Series

    # 可视化显示在界面
    # matplotlib inline
    import matplotlib
    import matplotlib.pyplot as plt

    plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来显示中文
    plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号

    import json
    import warnings

    warnings.filterwarnings('ignore')
    # 设置显示的最大列、宽等参数,消掉打印不完全中间的省略号
    # pd.set_option('display.max_columns', 1000)
    pd.set_option('display.width', 1000) # 加了这一行那表格的一行就不会分段出现了
    # pd.set_option('display.max_colwidth', 1000)
    # pd.set_option('display.height', 1000)
    # 显示所有列
    pd.set_option('display.max_columns', None)
    # 显示所有行
    pd.set_option('display.max_rows', None)
    movies = pd.read_csv('C:\\Users\\杜子轩\\Desktop\\王建民作业\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\tmdb_5000_movies.csv',
    encoding='utf_8')
    credits = pd.read_csv('C:\\Users\\杜子轩\\Desktop\\王建民作业\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\tmdb_5000_credits.csv',
    encoding='utf_8')
    movies.info() # 查看信息
    credits.info()
    # 两个数据框都有title列,以及movies.riginal_title
    # 以上三个数据列重复,删除两个
    del credits['title']
    del movies['original_title']

    # 连接两个csv文件
    merged = pd.merge(movies, credits, left_on='id', right_on='movie_id', how='left')

    # 删除不需要分析的列
    df = merged.drop(['homepage', 'overview', 'spoken_languages', 'status', 'tagline', 'movie_id'], axis=1)
    df.info()

    # 查找缺失值记录-release_date
    df[df.release_date.isnull()]

    # 查找缺失值记录-runtime
    df[df.runtime.isnull()]

    len(df.id.unique())

    df['release_year'] = pd.to_datetime(df.release_date, format='%Y-%m-%d', errors='coerce').dt.year
    df['release_month'] = pd.to_datetime(df.release_date).apply(lambda x: x.month)
    df['release_day'] = pd.to_datetime(df.release_date).apply(lambda x: x.day)
    df.info()

    df.describe()

    df = df[(df.vote_count >= 50) & (df.budget * df.revenue * df.popularity * df.vote_average != 0)].reset_index(
    drop='True')

    df

    json_column = ['genres', 'keywords', 'production_companies', 'production_countries', 'cast', 'crew']

    # 1-json本身为字符串类型,先转换为字典列表
    for i in json_column:
    df[i] = df[i].apply(json.loads)


    # 提取name
    # 2-将字典列表转换为以','分割的字符串
    def get_name(x):
    return ','.join([i['name'] for i in x])


    df['cast'] = df['cast'].apply(get_name)


    # 提取derector
    def get_director(x):
    for i in x:
    if i['job'] == 'Director':
    return i['name']


    df['crew'] = df['crew'].apply(get_director)

    for j in json_column[0:4]:
    df[j] = df[j].apply(get_name)

    # 重命名
    rename_dict = {'cast': 'actor', 'crew': 'director'}
    df.rename(columns=rename_dict, inplace=True)
    df.info()
    df.head(5)

    # 备份原始数据框original_df
    org_df = df.copy()

    df.to_excel('C:\\Users\\杜子轩\\Desktop\\王建民作业\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\result.xlsx', columns=["budget","genres","id","keywords","original_language","popularity","production_companies","production_countries","release_date","revenue","runtime","title","vote_average","vote_count","actor","director","release_year","release_month","release_day"], index=False,encoding='utf-8',sheet_name='Sheet1')
    #df.to_excel('C:\\Users\\杜子轩\\Desktop\\王建民作业\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\result.xlsx')

  • 相关阅读:
    JAVA 读取excel文件成List<Entity>
    JAVA 下载单个文件
    js替换指定位置字符串
    Java学习——继承
    Java——单例设计模式
    Java学习——static关键字
    个人成长阶段
    Android_xml背景色的值
    Android开发_关于点击事件
    Android开发_关于中英文切换
  • 原文地址:https://www.cnblogs.com/D10304/p/15700476.html
Copyright © 2011-2022 走看看