zoukankan html css js c++ java

Hortonworks 用于做 Sentiment Analysis 的 hiveddl.sql 文件

The hiveddl.sql script performs the following steps to refine the data:

Converted the raw Twitter data into a tabular format.
Used the dictionary file to score the sentiment of each Tweet by the number of positive words compared to the number of negative words, and then assigned a positive, negative, or neutral sentiment value to each Tweet.
Created a new table that includes the sentiment value for each Tweet.

http://hortonworks.com/hadoop-tutorial/how-to-refine-and-visualize-sentiment-data/

-- Add the JSON SerDe jar to the Hive session. It presumably provides the
-- org.openx.data.jsonserde.JsonSerDe class that the tweets_raw table names
-- in its ROW FORMAT SERDE clause.
-- NOTE(review): the jar is referenced by a relative path - confirm where the
-- session resolves it from.
ADD JAR json-serde-1.1.6-SNAPSHOT-jar-with-dependencies.jar;

-- tweets_raw: the tweet records as received from Twitter, read through the
-- JSON SerDe. The table is EXTERNAL and points at the files under LOCATION.
-- Only the attributes listed here are exposed as columns.
CREATE EXTERNAL TABLE tweets_raw (
    id                      BIGINT,
    created_at              STRING,
    source                  STRING,
    favorited               BOOLEAN,
    retweet_count           INT,
    retweeted_status        STRUCT<
        text:STRING,
        user:STRUCT<screen_name:STRING,name:STRING>
    >,
    entities                STRUCT<
        urls:ARRAY<STRUCT<expanded_url:STRING>>,
        user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
        hashtags:ARRAY<STRUCT<text:STRING>>
    >,
    text                    STRING,
    -- utc_offset was INT, but nulls arrive as strings, hence STRING
    user                    STRUCT<
        screen_name:STRING,
        name:STRING,
        friends_count:INT,
        followers_count:INT,
        statuses_count:INT,
        verified:BOOLEAN,
        utc_offset:STRING,
        time_zone:STRING
    >,
    in_reply_to_screen_name STRING,
    year                    INT,
    month                   INT,
    day                     INT,
    hour                    INT
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION '/user/hue/upload/upload/data/tweets_raw';

-- Sentiment dictionary, stored as tab-separated text files under LOCATION.
-- The l3 view joins on the word column and reads polarity, comparing it
-- against the values positive and negative.
-- The field delimiter is written as the \t escape rather than a raw tab
-- character, which is invisible and easily converted to spaces by editors
-- or copy-paste.
CREATE EXTERNAL TABLE dictionary (
    type     STRING,
    length   INT,
    word     STRING,
    pos      STRING,
    stemmed  STRING,
    polarity STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/hue/upload/upload/data/dictionary';

-- Lookup from a time zone name to a country, stored as tab-separated text
-- files under LOCATION. The tweets_clean view joins on time_zone to pick up
-- the country column.
-- The field delimiter is written as the \t escape rather than a raw tab
-- character, which is invisible and easily converted to spaces by editors
-- or copy-paste.
CREATE EXTERNAL TABLE time_zone_map (
    time_zone STRING,
    country   STRING,
    notes     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/hue/upload/upload/data/time_zone_map';

-- Clean up tweets
-- tweets_simple keeps the tweet id, a parsed timestamp, the text and the
-- time zone of the author.
--
-- created_at is expected in the Twitter layout, for example
--   Wed Aug 27 13:08:45 +0000 2008
-- so characters 5-19 hold the month, day and time, and characters 27-30
-- hold the year. The year is read from the value itself instead of being
-- fixed to 2013, and the hour is parsed with HH (0-23). The former hh
-- pattern is the 12-hour clock and turned 12:xx:xx into 00:xx:xx.
-- NOTE(review): the +0000 offset is not parsed, the time is interpreted in
-- the session time zone - confirm this is acceptable.
CREATE VIEW tweets_simple AS
SELECT
  id,
  cast(
    from_unixtime(
      unix_timestamp(
        concat(substring(created_at, 27, 4), ' ', substring(created_at, 5, 15)),
        'yyyy MMM dd HH:mm:ss'
      )
    ) AS timestamp
  ) ts,
  text,
  user.time_zone
FROM tweets_raw
;

-- tweets_clean: every row of tweets_simple with its time zone translated to
-- a country through time_zone_map. The outer join keeps tweets whose time
-- zone has no match, in which case country is NULL.
CREATE VIEW tweets_clean AS
SELECT
  t.id,
  t.ts,
  t.text,
  m.country
FROM tweets_simple t
LEFT OUTER JOIN time_zone_map m
  ON t.time_zone = m.time_zone;

-- Compute sentiment
-- l1: lower-cases the tweet text, splits it with sentences() and emits one
-- row per exploded element, carrying the tweet id. The words column holds
-- the array of words of that element.
CREATE VIEW l1 AS
SELECT
  id,
  words
FROM tweets_raw
LATERAL VIEW explode(sentences(lower(text))) sentence_tab AS words;
-- l2: flattens l1 further, one row per (tweet id, single word).
CREATE VIEW l2 AS
SELECT
  id,
  word
FROM l1
LATERAL VIEW explode(words) word_tab AS word;

-- was: select * from l2 left outer join dict d on l2.word = d.word where polarity = 'negative' limit 10;

-- l3: attaches a numeric polarity to every word of every tweet by looking
-- the word up in the dictionary: -1 for negative, 1 for positive, and 0 for
-- anything else, including words that are not in the dictionary.
CREATE VIEW l3 AS
SELECT
  id,
  w.word,
  CASE
    WHEN d.polarity = 'negative' THEN -1
    WHEN d.polarity = 'positive' THEN 1
    ELSE 0
  END AS polarity
FROM l2 w
LEFT OUTER JOIN dictionary d
  ON w.word = d.word;
 
 -- tweets_sentiment: one row per tweet id with a sentiment label derived
 -- from the sum of its word polarities in l3: positive when the sum is
 -- above zero, negative when below zero, neutral otherwise.
 CREATE TABLE tweets_sentiment
 STORED AS ORC
 AS
 SELECT
   scored.id,
   CASE
     WHEN scored.net_polarity > 0 THEN 'positive'
     WHEN scored.net_polarity < 0 THEN 'negative'
     ELSE 'neutral'
   END AS sentiment
 FROM (
   SELECT
     id,
     sum(polarity) AS net_polarity
   FROM l3
   GROUP BY id
 ) scored;

-- put everything back together and re-number sentiment
-- tweetsbi: the columns of tweets_clean plus a numeric sentiment:
-- 2 = positive, 1 = neutral, 0 = negative. Tweets without a row in
-- tweets_sentiment keep a NULL sentiment because of the outer join.
CREATE TABLE tweetsbi
STORED AS ORC
AS
SELECT
  t.id,
  t.ts,
  t.text,
  t.country,
  CASE
    WHEN s.sentiment = 'positive' THEN 2
    WHEN s.sentiment = 'neutral' THEN 1
    WHEN s.sentiment = 'negative' THEN 0
  END AS sentiment
FROM tweets_clean t
LEFT OUTER JOIN tweets_sentiment s
  ON t.id = s.id;

-- for Tableau or Excel
-- UDAF sentiscore = sum(sentiment)*50  / count(sentiment)

-- context n-gram made readable
-- twitter_3grams: for every (year, month, day, hour) bucket, asks
-- context_ngrams for the 10 most frequent word triples that follow the
-- words iron man 3 (the three nulls are the positions to fill in), then
-- explodes the returned array so that each snippet gets its own row.
--
-- Fixes relative to the previous version:
--   * the storage format was misspelled as RCFilese and is now RCFILE
--   * the source table was tweets, which this script never creates. It now
--     reads tweets_raw, which provides year, month, day, hour and text.
-- NOTE(review): if a separate tweets table exists outside this script,
-- confirm which source is intended.
CREATE TABLE twitter_3grams
STORED AS RCFILE
AS
SELECT year, month, day, hour, snippet
FROM
( SELECT
    year,
    month,
    day,
    hour,
    context_ngrams(sentences(lower(text)), array("iron","man","3",null,null,null), 10) ngs
  FROM tweets_raw
  GROUP BY year, month, day, hour
) base
-- ngsTab is an arbitrary table alias that LATERAL VIEW requires even though
-- it is never referenced
LATERAL VIEW
    explode(ngs) ngsTab AS snippet
;

查看全文

相关阅读:
LG P4284 [SHOI2014]概率充电器
 LG P2592 [ZJOI2008]生日聚会
 LG P4953 [USACO02FEB]Cow Cycling
LG P2389 电脑班的裁员
 LG P2344 [USACO11FEB]Generic Cow Protests G
前端简历
 前端面试题目
 大前端的技术栈
 前端 -为什么要清楚浮动？
Redis的功能实现

原文地址：https://www.cnblogs.com/datascientist/p/3461112.html

最新文章
css--响应式网格布局
 css--伪元素
 css--双飞翼布局
 css--圣杯布局
 Mysql--常用函数
 Mysql--分区
 2.21
2.20
2.19
2.18