zoukankan html css js c++ java

hive学习8（小案例1练习）

创建数据库

-- Create the tutorial database (no-op when it already exists) and make it
-- the current database for every statement that follows.
create database if not exists feigu;
use feigu;

创建表

stg_job表

-- stg_job: staging table holding the flat job-posting data taken from the
-- crawled web pages. Every attribute is stored as a string.
-- Partitioned by pt (job post date); text file, fields separated by Ctrl-A
-- (0x01, written as the escape '\001'); empty fields are read as NULL.
drop table if exists stg_job;
create table if not exists stg_job (
    web_id           string comment 'web id',
    web_type         string comment 'web type',
    job_url          string comment 'job url',
    job_name         string comment 'job name',
    job_location     string comment 'job location',
    job_desc         string comment 'job desc',
    edu              string comment 'education',
    gender           string comment 'gender',
    language         string comment 'language',
    major            string comment 'major',
    work_year        string comment 'work years',
    salary           string comment 'salary',
    company_name     string comment 'company name',
    company_desc     string comment 'company desc',
    company_address  string comment 'company address',
    company_worktype string comment 'company worktype',
    company_scale    string comment 'company scale',
    company_prop     string comment 'company property',
    company_website  string comment 'company_website',
    curl_timestamp   string comment 'curl timestamp'
)
comment 'all flat data from webpage'
partitioned by (`pt` string comment 'job post date ')
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as textfile;

s_job表(与stg_job相同的表结构)

-- s_job: cleaned copy of the job data; takes its column, partition and
-- storage definition from stg_job. "if not exists" keeps the script
-- re-runnable, like the other DDL statements in this file.
create table if not exists s_job like stg_job;

stg_news表



-- stg_news: staging table for flat news records (id, title, content and
-- creation time, all kept as strings).
-- Partitioned by pt; text file, fields separated by Ctrl-A (0x01, written
-- as the escape '\001'); empty fields are read as NULL.
drop table if exists stg_news;
create table if not exists stg_news (
    mysql_newsid string,
    news_title   string,
    content      string,
    create_time  string
)
comment 'all flat thread from Dz'
partitioned by (`pt` string)
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as textfile;

dm_job表


-- dm_job: data-mart table for job postings. It carries the stg_job columns
-- plus job_date and vip_flg.
-- Partitioned by pt; stored as a sequence file whose rows use Ctrl-A
-- (0x01, written as the escape '\001') as the field separator.
drop table if exists dm_job;
create table if not exists dm_job (
    web_id           string comment 'web id',
    web_type         string comment 'web type',
    job_url          string comment 'job url',
    job_name         string comment 'job name',
    job_location     string comment 'job location',
    job_desc         string comment 'job desc',
    edu              string comment 'education',
    gender           string comment 'gender',
    language         string comment 'language',
    major            string comment 'major',
    work_year        string comment 'work years',
    salary           string comment 'salary',
    job_date         string comment 'job date',
    company_name     string comment 'company name',
    company_desc     string comment 'company desc',
    company_address  string comment 'company address',
    company_worktype string comment 'company worktype',
    company_scale    string comment 'company scale',
    company_prop     string comment 'company property',
    company_website  string comment 'company_website',
    curl_timestamp   string comment 'curl timestamp',
    vip_flg          string
)
comment 'compute vip '
partitioned by (`pt` string)
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

dim_edu表


-- dim_edu: education dimension. One row per job posting, holding the raw
-- education text (edu_detail) and its category code (edu_type).
-- Partitioned by pt; stored as a sequence file whose rows use Ctrl-A
-- (0x01, written as the escape '\001') as the field separator.
drop table if exists dim_edu;
create table if not exists dim_edu (
    web_type     string,
    job_name     string,
    company_name string,
    edu_detail   string,
    edu_type     string
)
comment 'edu dimision'
partitioned by (`pt` string)
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

dim_workyear表


-- dim_workyear: work-experience dimension. One row per job posting, holding
-- the raw work-years text (workyear_detail) and its category code
-- (workyear_type).
-- Partitioned by pt; stored as a sequence file whose rows use Ctrl-A
-- (0x01, written as the escape '\001') as the field separator.
drop table if exists dim_workyear;
create table if not exists dim_workyear (
    web_type        string,
    job_name        string,
    company_name    string,
    workyear_detail string,
    workyear_type   string
)
comment 'work years'
partitioned by (`pt` string)
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

dim_joblocation表


-- dim_joblocation: job-location dimension. One row per job posting, holding
-- the raw location text (joblocation_detail) and its category code
-- (joblocation_type).
-- Partitioned by pt; stored as a sequence file whose rows use Ctrl-A
-- (0x01, written as the escape '\001') as the field separator.
drop table if exists dim_joblocation;
create table if not exists dim_joblocation (
    web_type           string,
    job_name           string,
    company_name       string,
    joblocation_detail string,
    joblocation_type   string
)
comment 'job location'
partitioned by (`pt` string)
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

dim_salary表

-- dim_salary: salary dimension. One row per job posting, holding the raw
-- salary text (salary_detail) and its category code (salary_type).
-- Partitioned by pt; stored as a sequence file whose rows use Ctrl-A
-- (0x01, written as the escape '\001') as the field separator.
drop table if exists dim_salary;
create table if not exists dim_salary (
    web_type      string,
    job_name      string,
    company_name  string,
    salary_detail string,
    salary_type   string
)
comment 'job salary'
partitioned by (`pt` string)
row format delimited
    fields terminated by '\001'
    null defined as ''
stored as sequencefile;

数据导入

将爬虫爬取的职位表信息导入到stg_job表中

-- Load the crawled job file from the local filesystem into the
-- pt='20150501' partition of stg_job, replacing whatever that partition
-- already contains.
load data local inpath '/home/data/daily/20150501/51job1.dat'
overwrite into table stg_job partition (pt='20150501');

hive数据清洗（ETL）

数据项为空：网页抓取下来的数据可能是空的需要剔除
检索结果不一致：编码或命名差异，例如品牌=耐克，商品品牌=耐克
噪声：包含错误或者异常值，如salary='-100'

数据预处理分为数据清理，数据变换，数据集成

对job_location（工作地点）, edu（学历）, work_year（工作年限）,salary（薪资范围）4列数据的空值进行转换

-- Clean stg_job into s_job for pt='20150501': a NULL or blank value in
-- job_location, edu, work_year or salary is replaced by the placeholder '--';
-- all other columns are copied unchanged.
--
-- The target partition is dynamic (partition (pt) with no value), so its
-- value comes from the last column of the select list. Because every
-- partition column is dynamic, Hive rejects the statement unless the
-- dynamic-partition mode is nonstrict, hence the settings below.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table s_job partition (pt)
select
    web_id,
    web_type,
    job_url,
    job_name,
    case when job_location is null or trim(job_location) = '' then '--' else job_location end as job_location,
    job_desc,
    case when edu is null or trim(edu) = '' then '--' else edu end as edu,
    gender,
    language,
    major,
    case when work_year is null or trim(work_year) = '' then '--' else work_year end as work_year,
    case when salary is null or trim(salary) = '' then '--' else salary end as salary,
    company_name,
    company_desc,
    company_address,
    company_worktype,
    company_scale,
    company_prop,
    company_website,
    curl_timestamp,
    pt
from stg_job
where pt = '20150501';

代码注释

insert overwrite table...partition (...) select 将查询结果集写入另一个表中
partition(pt):在目标表中使用了动态分区，Hive会根据select结果最后一列pt的值在s_job表中自动创建分区（当所有分区列都是动态分区时，需要先执行 set hive.exec.dynamic.partition.mode=nonstrict;）
overwrite会先删除s_job中pt='20150501'分区的数据，避免相同分区下的数据重复导入

hive提取维度信息

抽取“学历要求”维度信息插入到学历维度表

-- Populate the education dimension from the cleaned s_job rows of
-- pt='20150501'. edu_type is derived from the raw education text:
--   B1 = contains 大专 or 专科
--   B2 = contains 本科
--   B3 = contains 硕士 or 研究生
--   B9 = anything else
-- The first matching branch wins.
--
-- The target partition is fully dynamic (partition (pt)), which Hive only
-- accepts in nonstrict dynamic-partition mode, hence the settings below.
-- NOTE(review): "insert into" appends, so re-running this statement adds the
-- same rows to the partition again - confirm whether "insert overwrite" is
-- what is wanted here.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

insert into table dim_edu partition (pt)
select
    web_type,
    job_name,
    company_name,
    edu as edu_detail,
    case
        when edu like '%大专%' or edu like '%专科%' then 'B1'
        when edu like '%本科%' then 'B2'
        when edu like '%硕士%' or edu like '%研究生%' then 'B3'
        else 'B9'
    end as edu_type,
    pt
from s_job
where s_job.pt = '20150501';

抽取“工作地点”维度信息插入到工作地点维度表

-- Populate the job-location dimension from the cleaned s_job rows of
-- pt='20150501'. joblocation_type is derived from the raw location text:
--   A1 = contains 北京
--   A2 = contains 上海
--   A3 = contains 广州
--   A4 = contains 深圳
--   A9 = anything else
-- The first matching branch wins.
--
-- The target partition is fully dynamic (partition (pt)), which Hive only
-- accepts in nonstrict dynamic-partition mode, hence the settings below.
-- NOTE(review): "insert into" appends, so re-running this statement adds the
-- same rows to the partition again - confirm whether "insert overwrite" is
-- what is wanted here.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

insert into table dim_joblocation partition (pt)
select
    web_type,
    job_name,
    company_name,
    job_location as joblocation_detail,
    case
        when job_location like '%北京%' then 'A1'
        when job_location like '%上海%' then 'A2'
        when job_location like '%广州%' then 'A3'
        when job_location like '%深圳%' then 'A4'
        else 'A9'
    end as joblocation_type,
    pt
from s_job
where s_job.pt = '20150501';

查看全文

相关阅读:
232 前端之JQuery：JQuery扩展和事件
 231 前端之JQuery：JQuery文档操作
 Result(ActionResult、JsonResult、JavaScriptResult等)
Controller传值到前端页面的几种方式
 若要允许 GET 请求，请将 JsonRequestBehavior 设置为 AllowGet（转载）
过滤器
 ViewData丶ViewBag和TempData
C#-绘图双缓冲
 Kafka的架构原理，你真的理解吗？
Kafka的架构原理，你真的理解吗？

原文地址：https://www.cnblogs.com/wujiadong2014/p/6195685.html