zoukankan      html  css  js  c++  java
  • Hortonworks 用于做 Sentiment Analysis 的 hiveddl.sql 文件

    The hiveddl.sql script has performed the following steps to refine the data:

    • Converted the raw Twitter data into a tabular format.
    • Used the dictionary file to score the sentiment of each Tweet by the number of positive words compared to the number of negative words, and then assigned a positive, negative, or neutral sentiment value to each Tweet.
    • Created a new table that includes the sentiment value for each Tweet.

    http://hortonworks.com/hadoop-tutorial/how-to-refine-and-visualize-sentiment-data/

    ADD JAR json-serde-1.1.6-SNAPSHOT-jar-with-dependencies.jar;
    
    -- tweets_raw: external table over the tweet records as received from Twitter.
    -- Each record is read through the OpenX JSON SerDe from the files under
    -- LOCATION. Nested JSON objects are declared as STRUCTs and JSON arrays as
    -- ARRAYs of STRUCTs.
    CREATE EXTERNAL TABLE tweets_raw (
        id                       BIGINT,
        created_at               STRING,
        source                   STRING,
        favorited                BOOLEAN,
        retweet_count            INT,
        retweeted_status         STRUCT<
            text:STRING,
            user:STRUCT<
                screen_name:STRING,
                name:STRING
            >
        >,
        entities                 STRUCT<
            urls:ARRAY<STRUCT<expanded_url:STRING>>,
            user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
            hashtags:ARRAY<STRUCT<text:STRING>>
        >,
        text                     STRING,
        user                     STRUCT<
            screen_name:STRING,
            name:STRING,
            friends_count:INT,
            followers_count:INT,
            statuses_count:INT,
            verified:BOOLEAN,
    -- utc_offset was INT, but nulls arrive as strings, so it is kept as STRING
            utc_offset:STRING,
            time_zone:STRING
        >,
        in_reply_to_screen_name  STRING,
    -- year/month/day/hour are declared as ordinary columns of the record
        year                     INT,
        month                    INT,
        day                      INT,
        hour                     INT
    )
    ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
    LOCATION '/user/hue/upload/upload/data/tweets_raw'
    ;
    
    -- Sentiment dictionary: external table over a tab-delimited text file.
    -- The word and polarity columns are the ones used downstream: view l3 joins
    -- on word and maps polarity 'negative' / 'positive' to -1 / +1.
    -- The field delimiter is written as the '\t' escape instead of a literal tab
    -- character, because an invisible tab inside the quotes is easily converted
    -- to spaces by editors or copy-paste, which would silently break parsing.
    CREATE EXTERNAL TABLE dictionary (
        type     STRING,
        length   INT,
        word     STRING,
        pos      STRING,
        stemmed  STRING,
        polarity STRING
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    LOCATION '/user/hue/upload/upload/data/dictionary';
    
    -- Time zone lookup: external table over a tab-delimited text file that maps
    -- a time zone name to a country. View tweets_clean joins on time_zone to
    -- obtain country.
    -- The field delimiter is written as the '\t' escape instead of a literal tab
    -- character, because an invisible tab inside the quotes is easily converted
    -- to spaces by editors or copy-paste, which would silently break parsing.
    CREATE EXTERNAL TABLE time_zone_map (
        time_zone STRING,
        country   STRING,
        notes     STRING
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    LOCATION '/user/hue/upload/upload/data/time_zone_map';
    
    -- Clean up tweets: keep the id, a parsed timestamp (ts), the text and the
    -- author's time zone.
    -- ts is built from the 15 characters of created_at starting at position 5,
    -- prefixed with a year and parsed with the pattern 'yyyy MMM dd HH:mm:ss'.
    -- The hour field uses HH (hour of day, 0-23). The previous pattern used hh
    -- (clock hour, 1-12), under which an hour value of 12 is read as 00.
    -- NOTE(review): assumes created_at looks like 'Wed Aug 27 13:08:45 +0000 2013'
    -- with a 24-hour time - confirm against the data.
    -- NOTE(review): the year is hard-coded to 2013 rather than taken from
    -- created_at - tweets from any other year get a wrong ts.
    CREATE VIEW tweets_simple AS
    SELECT
      id,
      CAST(
        from_unixtime(
          unix_timestamp(
            concat('2013 ', substring(created_at, 5, 15)),
            'yyyy MMM dd HH:mm:ss'
          )
        ) AS TIMESTAMP
      ) ts,
      text,
      user.time_zone
    FROM tweets_raw
    ;
    
    -- tweets_clean: tweets_simple with the time zone replaced by a country,
    -- looked up in time_zone_map. The outer join keeps tweets whose time zone
    -- has no match in the map - their country is NULL.
    CREATE VIEW tweets_clean AS
    SELECT
      t.id,
      t.ts,
      t.text,
      m.country
    FROM tweets_simple t
    LEFT OUTER JOIN time_zone_map m
      ON t.time_zone = m.time_zone;
    
    -- Compute sentiment
    -- l1: lower-case the tweet text, split it with sentences() and emit one row
    -- per element of the result as the column words.
    CREATE VIEW l1 AS
    SELECT
      id,
      words
    FROM tweets_raw
    LATERAL VIEW explode(sentences(lower(text))) dummy AS words;
    -- l2: explode words again so that each row carries a single word.
    CREATE VIEW l2 AS
    SELECT
      id,
      word
    FROM l1
    LATERAL VIEW explode(words) dummy AS word;
    
    -- was: select * from l2 left outer join dict d on l2.word = d.word where polarity = 'negative' limit 10

    -- l3: score every word of every tweet against the sentiment dictionary.
    -- 'negative' scores -1 and 'positive' scores 1. Anything else, including
    -- words that have no dictionary entry, scores 0.
    CREATE VIEW l3 AS
    SELECT
      w.id,
      w.word,
      CASE
        WHEN d.polarity = 'negative' THEN -1
        WHEN d.polarity = 'positive' THEN 1
        ELSE 0
      END AS polarity
    FROM l2 w
    LEFT OUTER JOIN dictionary d
      ON w.word = d.word;
     
     -- tweets_sentiment: one row per tweet id with an overall sentiment label.
     -- The per-word scores from l3 are summed per tweet. A positive total is
     -- labelled 'positive', a negative total 'negative', and anything else
     -- 'neutral'.
     CREATE TABLE tweets_sentiment
     STORED AS ORC
     AS
     SELECT
       scored.id,
       CASE
         WHEN scored.total > 0 THEN 'positive'
         WHEN scored.total < 0 THEN 'negative'
         ELSE 'neutral'
       END AS sentiment
     FROM (
       SELECT
         id,
         sum(polarity) AS total
       FROM l3
       GROUP BY id
     ) scored;
    
    -- tweetsbi: put everything back together - the cleaned tweet columns plus
    -- the sentiment re-numbered as an integer: positive = 2, neutral = 1,
    -- negative = 0. Tweets with no row in tweets_sentiment are kept by the outer
    -- join and get a NULL sentiment, because the CASE has no ELSE branch.
    CREATE TABLE tweetsbi
    STORED AS ORC
    AS
    SELECT
      t.id,
      t.ts,
      t.text,
      t.country,
      CASE
        WHEN s.sentiment = 'positive' THEN 2
        WHEN s.sentiment = 'neutral' THEN 1
        WHEN s.sentiment = 'negative' THEN 0
      END AS sentiment
    FROM tweets_clean t
    LEFT OUTER JOIN tweets_sentiment s
      ON t.id = s.id;
    
    -- for Tableau or Excel
    -- UDAF sentiscore = sum(sentiment)*50  / count(sentiment)
    
    -- context n-gram made readable
    -- For every (year, month, day, hour) group, context_ngrams() is asked for the
    -- 10 most frequent word triples that follow the context "iron man 3". The
    -- three nulls in the context array are the positions to be filled in.
    -- The outer query explodes the resulting array into one row per snippet.
    -- The storage format is spelled RCFILE. The previous text read 'RCFilese',
    -- which is not a Hive file format and makes the statement fail to parse.
    -- NOTE(review): the source table is named tweets, which is not created by any
    -- statement visible here - confirm it exists, or whether tweets_raw (which has
    -- year, month, day, hour and text) was intended.
    CREATE TABLE twitter_3grams
    STORED AS RCFILE
    AS
    SELECT year, month, day, hour, snippet
    FROM
    ( SELECT
        year,
        month,
        day,
        hour,
        context_ngrams(sentences(lower(text)), array("iron","man","3",null,null,null), 10) ngs
      FROM tweets
      GROUP BY year, month, day, hour
    ) base
    -- ngsTab is an arbitrary table alias that LATERAL VIEW requires even though
    -- it is never referenced
    LATERAL VIEW
        explode(ngs) ngsTab AS snippet
    ;
  • 相关阅读:
    CXB 闯关游戏
    CXB 移动“哨兵棋子”
    GHOJ 300 Hanoi塔
    攻防世界 web 进阶区 刷题记录
    攻防世界 web 新手练习 刷题记录
    TensorFlow01:增加变量显示+tensorboard可视化
    TensorFlow01:梯度下降
    TensorFlow01:张量
    01深度学习介绍
    05Python爬虫:响应内容写入文件
  • 原文地址:https://www.cnblogs.com/datascientist/p/3461112.html
Copyright © 2011-2022 走看看