The hiveddl.sql script has performed the following steps to refine the data:
- Converted the raw Twitter data into a tabular format.
- Used the dictionary file to score the sentiment of each Tweet: every word found in the dictionary as positive counts +1 and every word found as negative counts -1, and the Tweet is then assigned a positive, negative, or neutral sentiment value depending on whether the total is above, below, or equal to zero.
- Created a new table that includes the sentiment value for each Tweet.
-- hiveddl.sql: refine raw Twitter JSON into sentiment-scored Hive tables.
--
-- Steps:
--   1. Expose the raw tweets, the sentiment dictionary and the time zone map
--      as external tables.
--   2. Build the views tweets_simple and tweets_clean (timestamp and country).
--   3. Score every tweet word by word (views l1, l2, l3) into tweets_sentiment.
--   4. Join the results into tweetsbi and collect n-gram context in
--      twitter_3grams.

-- JSON SerDe needed to read tweets_raw.
ADD JAR json-serde-1.1.6-SNAPSHOT-jar-with-dependencies.jar;

-- create the tweets_raw table containing the records as received from Twitter
CREATE EXTERNAL TABLE tweets_raw (
    id BIGINT,
    created_at STRING,
    source STRING,
    favorited BOOLEAN,
    retweet_count INT,
    retweeted_status STRUCT<
        text:STRING,
        user:STRUCT<screen_name:STRING,name:STRING>>,
    entities STRUCT<
        urls:ARRAY<STRUCT<expanded_url:STRING>>,
        user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>,
        hashtags:ARRAY<STRUCT<text:STRING>>>,
    text STRING,
    user STRUCT<
        screen_name:STRING,
        name:STRING,
        friends_count:INT,
        followers_count:INT,
        statuses_count:INT,
        verified:BOOLEAN,
        -- utc_offset was INT but nulls are strings
        utc_offset:STRING,
        time_zone:STRING>,
    in_reply_to_screen_name STRING,
    year INT,
    month INT,
    day INT,
    hour INT
)
-- SerDe class provided by the json-serde jar added above.
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION '/user/hue/upload/upload/data/tweets_raw';

-- create sentiment dictionary
-- NOTE(review): the delimiter is set to tab on the assumption that the
-- dictionary file is tab-separated - confirm against the uploaded file.
CREATE EXTERNAL TABLE dictionary (
    type STRING,
    length INT,
    word STRING,
    pos STRING,
    stemmed STRING,
    polarity STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/hue/upload/upload/data/dictionary';

-- Maps a Twitter time zone name to a country.
-- NOTE(review): tab-delimited because time zone and country names can contain
-- spaces - confirm against the uploaded file.
CREATE EXTERNAL TABLE time_zone_map (
    time_zone STRING,
    country STRING,
    notes STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/hue/upload/upload/data/time_zone_map';

-- Clean up tweets
-- Assumes created_at uses the Twitter layout EEE MMM dd HH:mm:ss Z yyyy
-- (for example Wed Aug 27 13:08:45 +0000 2008) - TODO confirm.
-- Characters 5-19 hold the month, day and time and characters 27-30 the year.
-- The UTC offset is ignored, as before.
-- HH (24-hour clock) is required: with hh a time of 12:xx is read as 00:xx.
CREATE VIEW tweets_simple AS
SELECT
    id,
    CAST(
        FROM_UNIXTIME(
            UNIX_TIMESTAMP(
                CONCAT(SUBSTRING(created_at, 27, 4), ' ', SUBSTRING(created_at, 5, 15)),
                'yyyy MMM dd HH:mm:ss'
            )
        ) AS TIMESTAMP
    ) AS ts,
    text,
    user.time_zone AS time_zone
FROM tweets_raw;

-- Adds the country looked up from the time zone of the tweet author.
-- Tweets whose time zone has no entry in time_zone_map keep a NULL country.
CREATE VIEW tweets_clean AS
SELECT
    t.id,
    t.ts,
    t.text,
    m.country
FROM tweets_simple t
LEFT OUTER JOIN time_zone_map m
    ON t.time_zone = m.time_zone;

-- Compute sentiment
-- l1: one row per sentence of each tweet (words is an array of lowercase words).
CREATE VIEW l1 AS
SELECT
    id,
    words
FROM tweets_raw
LATERAL VIEW EXPLODE(SENTENCES(LOWER(text))) dummy AS words;

-- l2: one row per word of each tweet.
CREATE VIEW l2 AS
SELECT
    id,
    word
FROM l1
LATERAL VIEW EXPLODE(words) dummy AS word;

-- was: select * from l2 left outer join dict d on l2.word = d.word where polarity = 'negative' limit 10

-- l3: each word scored -1 (negative), 1 (positive) or 0 (anything else,
-- including words that are not in the dictionary).
CREATE VIEW l3 AS
SELECT
    l2.id,
    l2.word,
    CASE d.polarity
        WHEN 'negative' THEN -1
        WHEN 'positive' THEN 1
        ELSE 0
    END AS polarity
FROM l2
LEFT OUTER JOIN dictionary d
    ON l2.word = d.word;

-- One row per tweet: the sign of the summed word scores decides the label.
CREATE TABLE tweets_sentiment
STORED AS ORC
AS
SELECT
    id,
    CASE
        WHEN SUM(polarity) > 0 THEN 'positive'
        WHEN SUM(polarity) < 0 THEN 'negative'
        ELSE 'neutral'
    END AS sentiment
FROM l3
GROUP BY id;

-- put everything back together and re-number sentiment
-- (2 = positive, 1 = neutral, 0 = negative, NULL when the tweet has no row
-- in tweets_sentiment)
CREATE TABLE tweetsbi
STORED AS ORC
AS
SELECT
    t.*,
    CASE s.sentiment
        WHEN 'positive' THEN 2
        WHEN 'neutral' THEN 1
        WHEN 'negative' THEN 0
    END AS sentiment
FROM tweets_clean t
LEFT OUTER JOIN tweets_sentiment s
    ON t.id = s.id;

-- for Tableau or Excel
-- UDAF sentiscore = sum(sentiment)*50 / count(sentiment)

-- context n-gram made readable
-- For every hour, context_ngrams estimates the 10 most frequent three-word
-- sequences that follow the words iron man 3 (the NULL slots are the
-- positions to fill in). Each result is exploded into its own row.
-- NOTE(review): the table tweets is not created in this script. tweets_raw is
-- the only table defined here with year, month, day, hour and text columns -
-- confirm which source table is intended.
CREATE TABLE twitter_3grams
STORED AS RCFILE
AS
SELECT
    year,
    month,
    day,
    hour,
    snippet
FROM (
    SELECT
        year,
        month,
        day,
        hour,
        CONTEXT_NGRAMS(
            SENTENCES(LOWER(text)),
            ARRAY("iron", "man", "3", NULL, NULL, NULL),
            10
        ) ngs
    FROM tweets
    GROUP BY year, month, day, hour
) base
-- ngsTab is random alias => must be there even though not used
LATERAL VIEW EXPLODE(ngs) ngsTab AS snippet;