zoukankan      html  css  js  c++  java
  • hive学习8(小案例1练习)

    创建数据库

    -- Create the tutorial database only when it is missing, so this step can be
    -- re-run without an "already exists" error, then make it the current database.
    hive> create database if not exists feigu;
    hive> use feigu;
    
    
    

    创建表

    • stg_job表
    -- Staging table for the flat job data taken from web pages.
    -- Every column is a plain string and the table is partitioned by pt
    -- (declared below as the job post date).
    -- Text layout: fields are separated by Ctrl-A (octal 001, written with the
    -- backslash escape) and an empty field is read back as NULL.
    drop table if exists stg_job;
    create table if not exists stg_job (
        web_id           string comment 'web id',
        web_type         string comment 'web type',
        job_url          string comment 'job url',
        job_name         string comment 'job name',
        job_location     string comment 'job location',
        job_desc         string comment 'job desc',
        edu              string comment 'education',
        gender           string comment 'gender',
        language         string comment 'language',
        major            string comment 'major',
        work_year        string comment 'work years',
        salary           string comment 'salary',
        company_name     string comment 'company name',
        company_desc     string comment 'company desc',
        company_address  string comment 'company address',
        company_worktype string comment 'company worktype',
        company_scale    string comment 'company scale',
        company_prop     string comment 'company property',
        company_website  string comment 'company_website',
        curl_timestamp   string comment 'curl timestamp'
    )
    comment 'all flat data from webpage'
    partitioned by (`pt` string comment 'job post date ')
    row format delimited
        fields terminated by '\001'
        null defined as ''
    stored as textfile;
    
    
    
    
    
    • s_job表(与stg_job相同的表结构)
    -- s_job receives the cleaned rows of stg_job and reuses its table definition
    -- through LIKE. IF NOT EXISTS keeps the step re-runnable, in line with the
    -- other CREATE statements in this script.
    create table if not exists s_job like stg_job;
    
    
    
    • stg_news表
    
    
    -- Staging table for news records (the table comment calls them flat threads from Dz).
    -- All columns are strings and the table is partitioned by pt.
    -- NOTE(review): mysql_newsid presumably carries the id from an upstream MySQL
    -- source - confirm against the loader, which is not shown here.
    -- Text layout: fields are separated by Ctrl-A (octal 001, written with the
    -- backslash escape) and an empty field is read back as NULL.
    drop table if exists stg_news;
    create table if not exists stg_news (
        mysql_newsid string,
        news_title   string,
        content      string,
        create_time  string
    )
    comment 'all flat thread from Dz'
    partitioned by (`pt` string)
    row format delimited
        fields terminated by '\001'
        null defined as ''
    stored as textfile;
    
    
    • dm_job表
    
    -- Data-mart job table. It repeats the stg_job columns and adds job_date and
    -- vip_flg (the table comment says it is used to compute vip).
    -- Partitioned by pt and stored as a sequence file.
    -- Fields are separated by Ctrl-A (octal 001, written with the backslash escape)
    -- and an empty field is read back as NULL.
    drop table if exists dm_job;
    create table if not exists dm_job (
        web_id           string comment 'web id',
        web_type         string comment 'web type',
        job_url          string comment 'job url',
        job_name         string comment 'job name',
        job_location     string comment 'job location',
        job_desc         string comment 'job desc',
        edu              string comment 'education',
        gender           string comment 'gender',
        language         string comment 'language',
        major            string comment 'major',
        work_year        string comment 'work years',
        salary           string comment 'salary',
        job_date         string comment 'job date',
        company_name     string comment 'company name',
        company_desc     string comment 'company desc',
        company_address  string comment 'company address',
        company_worktype string comment 'company worktype',
        company_scale    string comment 'company scale',
        company_prop     string comment 'company property',
        company_website  string comment 'company_website',
        curl_timestamp   string comment 'curl timestamp',
        vip_flg          string
    )
    comment 'compute vip '
    partitioned by (`pt` string)
    row format delimited
        fields terminated by '\001'
        null defined as ''
    stored as sequencefile;
    
    • dim_edu表
    
    -- Education dimension table, partitioned by pt and stored as a sequence file.
    -- edu_detail keeps the original education text. edu_type holds the bucket code
    -- assigned by the dim_edu insert later in this file (B1, B2, B3 or B9).
    -- Fields are separated by Ctrl-A (octal 001, written with the backslash escape)
    -- and an empty field is read back as NULL.
    drop table if exists dim_edu;
    create table if not exists dim_edu (
        web_type     string,
        job_name     string,
        company_name string,
        edu_detail   string,
        edu_type     string
    )
    comment 'edu dimision'
    partitioned by (`pt` string)
    row format delimited
        fields terminated by '\001'
        null defined as ''
    stored as sequencefile;
    
    • dim_workyear表
    
    -- Work-years dimension table, partitioned by pt and stored as a sequence file.
    -- NOTE(review): workyear_detail and workyear_type presumably mirror the
    -- detail/type pair of dim_edu (original text plus bucket code) - the statement
    -- that fills this table is not shown, so confirm before relying on it.
    -- Fields are separated by Ctrl-A (octal 001, written with the backslash escape)
    -- and an empty field is read back as NULL.
    drop table if exists dim_workyear;
    create table if not exists dim_workyear (
        web_type        string,
        job_name        string,
        company_name    string,
        workyear_detail string,
        workyear_type   string
    )
    comment 'work years'
    partitioned by (`pt` string)
    row format delimited
        fields terminated by '\001'
        null defined as ''
    stored as sequencefile;
    
    • dim_joblocation表
    
    -- Job-location dimension table, partitioned by pt and stored as a sequence file.
    -- joblocation_detail keeps the original location text. joblocation_type holds the
    -- city code assigned by the dim_joblocation insert later in this file
    -- (A1, A2, A3, A4 or A9).
    -- Fields are separated by Ctrl-A (octal 001, written with the backslash escape)
    -- and an empty field is read back as NULL.
    drop table if exists dim_joblocation;
    create table if not exists dim_joblocation (
        web_type           string,
        job_name           string,
        company_name       string,
        joblocation_detail string,
        joblocation_type   string
    )
    comment 'job location'
    partitioned by (`pt` string)
    row format delimited
        fields terminated by '\001'
        null defined as ''
    stored as sequencefile;
    
    
    • dim_salary表
    -- Salary dimension table, partitioned by pt and stored as a sequence file.
    -- NOTE(review): salary_detail and salary_type presumably mirror the detail/type
    -- pair of dim_edu (original text plus bucket code) - the statement that fills
    -- this table is not shown, so confirm before relying on it.
    -- Fields are separated by Ctrl-A (octal 001, written with the backslash escape)
    -- and an empty field is read back as NULL.
    drop table if exists dim_salary;
    create table if not exists dim_salary (
        web_type      string,
        job_name      string,
        company_name  string,
        salary_detail string,
        salary_type   string
    )
    comment 'job salary'
    partitioned by (`pt` string)
    row format delimited
        fields terminated by '\001'
        null defined as ''
    stored as sequencefile;
    

    数据导入

    将爬虫爬取的职位表信息导入到stg_job表中
    -- Load the crawler dump from the local filesystem into the 20150501 partition
    -- of stg_job. OVERWRITE replaces whatever that partition already holds.
    hive> load data local inpath '/home/data/daily/20150501/51job1.dat' overwrite
        > into table stg_job partition (pt = '20150501');
    
    
    
    

    hive数据清洗(ETL)

    • 数据项为空:网页抓取下来的数据可能是空的需要剔除
    • 检索结果不一致:编码或命名差异,例如品牌=耐克,商品品牌=耐克
    • 噪声:包含错误或者异常值,如salary='-100'

    数据预处理分为数据清理,数据变换,数据集成

    对job_location(工作地点), edu(学历), work_year(工作年限),salary(薪资范围)4列数据的空值进行转换
    -- Copy the 20150501 partition of stg_job into s_job. In job_location, edu,
    -- work_year and salary a missing value (NULL, or blank after trim) is replaced
    -- by the two-dash placeholder. All other columns are copied unchanged.
    -- partition (pt) is fully dynamic: the trailing pt column of the select picks
    -- the target partition. Hive rejects an insert with no static partition column
    -- in strict mode, so dynamic partitioning is enabled in nonstrict mode first.
    hive> set hive.exec.dynamic.partition=true;
    hive> set hive.exec.dynamic.partition.mode=nonstrict;
    hive> insert overwrite table s_job partition (pt)
        > select
        >     web_id,
        >     web_type,
        >     job_url,
        >     job_name,
        >     case when job_location is null or trim(job_location) = "" then "--" else job_location end as job_location,
        >     job_desc,
        >     case when edu is null or trim(edu) = "" then "--" else edu end as edu,
        >     gender,
        >     language,
        >     major,
        >     case when work_year is null or trim(work_year) = "" then "--" else work_year end as work_year,
        >     case when salary is null or trim(salary) = "" then "--" else salary end as salary,
        >     company_name,
        >     company_desc,
        >     company_address,
        >     company_worktype,
        >     company_scale,
        >     company_prop,
        >     company_website,
        >     curl_timestamp,
        >     pt
        > from stg_job
        > where pt = "20150501";
    
    
    

    代码注释

    • insert overwrite table...partition (...) select 将查询结果集写入另一个表中
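    • partition(pt):在目标表中使用了动态分区,会在s_job表中自动创建分区;由于所有分区列都是动态的,执行前需先设置 hive.exec.dynamic.partition=true 和 hive.exec.dynamic.partition.mode=nonstrict(strict 模式要求至少有一个静态分区列)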
    • overwrite会先删除s_job中pt='20150501'分区的数据,避免相同分区下的数据重复导入

    hive提取维度信息

    抽取“学历要求”维度信息插入到学历维度表

    -- Fill dim_edu from the 20150501 partition of s_job. edu_detail keeps the
    -- original education text and edu_type is the bucket code:
    --   B1 = junior college (text contains 大专 or 专科)
    --   B2 = bachelor (text contains 本科)
    --   B3 = master or postgraduate (text contains 硕士 or 研究生)
    --   B9 = anything else, including a NULL edu
    -- The first matching branch wins. LIKE already yields a boolean, so it is
    -- used directly as the condition.
    -- partition (pt) is fully dynamic, which Hive rejects in strict mode, so
    -- dynamic partitioning is enabled in nonstrict mode first.
    -- NOTE(review): insert into appends, so re-running this step adds the same rows
    -- again - switch to insert overwrite if the step must be repeatable.
    hive> set hive.exec.dynamic.partition=true;
    hive> set hive.exec.dynamic.partition.mode=nonstrict;
    hive> insert into table dim_edu partition (pt)
        > select
        >     web_type,
        >     job_name,
        >     company_name,
        >     edu as edu_detail,
        >     case
        >         when edu like '%大专%' or edu like '%专科%' then 'B1'
        >         when edu like '%本科%' then 'B2'
        >         when edu like '%硕士%' or edu like '%研究生%' then 'B3'
        >         else 'B9'
        >     end as edu_type,
        >     pt
        > from s_job
        > where s_job.pt = '20150501';
    
    

    抽取“工作地点”维度信息插入到工作地点维度表

    -- Fill dim_joblocation from the 20150501 partition of s_job. joblocation_detail
    -- keeps the original location text and joblocation_type is the city code:
    --   A1 = Beijing (北京), A2 = Shanghai (上海), A3 = Guangzhou (广州),
    --   A4 = Shenzhen (深圳), A9 = anything else, including a NULL job_location
    -- The first matching branch wins, so a text naming several cities gets the
    -- code of the earliest branch. LIKE already yields a boolean, so it is used
    -- directly as the condition.
    -- partition (pt) is fully dynamic, which Hive rejects in strict mode, so
    -- dynamic partitioning is enabled in nonstrict mode first.
    -- NOTE(review): insert into appends, so re-running this step adds the same rows
    -- again - switch to insert overwrite if the step must be repeatable.
    hive> set hive.exec.dynamic.partition=true;
    hive> set hive.exec.dynamic.partition.mode=nonstrict;
    hive> insert into table dim_joblocation partition (pt)
        > select
        >     web_type,
        >     job_name,
        >     company_name,
        >     job_location as joblocation_detail,
        >     case
        >         when job_location like '%北京%' then 'A1'
        >         when job_location like '%上海%' then 'A2'
        >         when job_location like '%广州%' then 'A3'
        >         when job_location like '%深圳%' then 'A4'
        >         else 'A9'
        >     end as joblocation_type,
        >     pt
        > from s_job
        > where s_job.pt = '20150501';
    
    
    
  • 相关阅读:
    BZOJ.2199.[USACO2011 Jan]奶牛议会(2-SAT)
    BZOJ.1997.[HNOI2010]Planar(2-SAT)
    POJ.3648.Wedding(2-SAT)
    POJ.3678.Katu Puzzle(2-SAT)
    POJ.3207.Ikki's Story IV-Panda's Trick(2-SAT)
    洛谷.4180.[模板]次小生成树Tree(Kruskal LCA 倍增)
    BZOJ.4766.文艺计算姬(Prufer)
    zabbix 微信告警机制
    网络地址
    tcp与udp的区别
  • 原文地址:https://www.cnblogs.com/wujiadong2014/p/6195685.html
Copyright © 2011-2022 走看看