Friday, 24 February 2017

Transpose column to row

Transpose or pivot a table: convert multiple rows per key into columns (here, one row per consumer with its five most recent tweets as Tweet_1 … Tweet_5).


create table twit_seq STORED AS ORC tblproperties ("orc.compress"="NONE") as
select t1.consumer_id, t1.text, t1.rk
from (
  select consumer_id, text, postcreationtime1,
         rank() over (partition by consumer_id order by postcreationtime1 desc) as rk
  from dunami_v1
) t1
where t1.rk < 6;  -- keep the 5 most recent tweets per consumer


create table customer_tweet_info_dnami STORED AS ORC tblproperties ("orc.compress"="NONE") as
SELECT consumer_id ,
   max(CASE WHEN rk = 1 THEN text else NULL END) AS Tweet_1,
   max(CASE WHEN rk = 2 THEN text else NULL END) AS Tweet_2,
   max(CASE WHEN rk = 3 THEN text else NULL END) AS Tweet_3,
   max(CASE WHEN rk = 4 THEN text else NULL END) AS Tweet_4,
   max(CASE WHEN rk = 5 THEN text else NULL END) AS Tweet_5
FROM twit_seq  group by consumer_id;
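
A made-up before/after to show what the two steps produce (values are invented purely for illustration):

twit_seq                              customer_tweet_info_dnami
consumer_id  text      rk            consumer_id  tweet_1   tweet_2  ...  tweet_5
101          "newest"  1       =>    101          "newest"  "older"  ...  "oldest"
101          "older"   2
101          "oldest"  5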

Week and rank functions

Calculate weeks

select abs(floor(datediff(timestamp(from_unixtime(UNIX_TIMESTAMP(trim('01/04/16 00:00'),'MM/dd/yy HH:mm'))), postcreationtime1) / 7)) as wk
from dunami_twitter_visual_data_v1 limit 100;
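
Note the order of abs() and floor(): a post 16 days after the anchor date gives datediff() = -16, and abs(floor(-16/7)) = abs(-3) = 3, while floor(abs(-16)/7) = 2. A quick check with literal dates:

select abs(floor(datediff('2016-01-04','2016-01-20') / 7)) as wk_rounded_up;  -- 3
select floor(abs(datediff('2016-01-04','2016-01-20')) / 7) as wk_truncated;   -- 2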


Use the rank() function to pick the latest values per key (ordered descending):

create table twit_seq STORED AS ORC tblproperties ("orc.compress"="NONE") as
select t1.consumer_id, t1.text
from (
  select consumer_id, text, postcreationtime1,
         rank() over (partition by consumer_id order by postcreationtime1 desc) as rk
  from dunami_v1
) t1
where t1.rk < 6;
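
Note that rank() gives tweets with identical timestamps the same rank, so a consumer can end up with more than 5 rows; a variant with row_number() guarantees at most 5 per consumer:

select t1.consumer_id, t1.text
from (
  select consumer_id, text,
         row_number() over (partition by consumer_id order by postcreationtime1 desc) as rn
  from dunami_v1
) t1
where t1.rn < 6;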

Monday, 13 February 2017

Hive Optimization properties

set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled=true;
set hive.cbo.enable=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.column.stats=true;
set hive.stats.fetch.partition.stats=true;
set mapreduce.job.maps=1000;     -- hint for the number of map tasks
set mapreduce.job.reduces=1000;  -- number of reduce tasks
set hive.exec.parallel=true;
-----------------------------------------------------------

set hive.vectorized.groupby.maxentries=10240;
set hive.vectorized.groupby.flush.percent=0.1;
set hive.cbo.enable=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.column.stats=true;
set hive.stats.fetch.partition.stats=true;
set hive.cli.print.header=true;


If you are working with Tez and hit an error like tez.TezProcessor: java.lang.OutOfMemoryError: Java heap space, set these properties at the query level so the container size increases for that particular query.

SET hive.tez.java.opts=-Xmx8192m -Xms4096m;

SET hive.tez.container.size=8192;

Keep -Xmx within hive.tez.container.size, otherwise YARN kills the container.

For more details, see:
https://discuss.pivotal.io/hc/en-us/articles/226239948-Hive-query-with-TEZ-engine-failed-with-exception-java-lang-OutOfMemoryError-Java-heap-space


If a Hive query's output generates empty files along with the data files, use the properties below to merge them:
set hive.merge.tezfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.merge.orcfile.stripe.level=true;
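
The merge behaviour can also be tuned with two size knobs (the values shown here are the usual defaults, adjust to taste):

set hive.merge.size.per.task=256000000;      -- target size of each merged file
set hive.merge.smallfiles.avgsize=16000000;  -- merge kicks in when the average output file is smaller than this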

=================================================

Optimization for ORC files

set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled=true;
set hive.vectorized.execution.reduce.groupby.enabled=true;
set hive.exec.parallel=true;
SET hive.tez.java.opts=-Xmx8192m -Xms4096m;
SET hive.tez.container.size=8192;

Sunday, 5 February 2017

Copy data from S3 to hdfs

hadoop distcp s3n://<access key>:<secret key>@bucket/testhadoop/*/* /user/hadoop_user/test/ 
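
Note that embedding the secret key in the URL leaks it into logs and shell history; a safer sketch using the s3a connector with the credentials passed as properties (bucket and paths are placeholders):

hadoop distcp -Dfs.s3a.access.key=<access key> -Dfs.s3a.secret.key=<secret key> s3a://bucket/testhadoop/*/* /user/hadoop_user/test/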

Date formats in Hive

Examples -  5/11/2015       12/31/2014      5/11/2015

select optin_date,cell_phone_date,optin_date_stamp from datetest;

select
  to_date(from_unixtime(UNIX_TIMESTAMP(trim(optin_date), 'MM/dd/yyyy'))) as optin_date,
  to_date(from_unixtime(UNIX_TIMESTAMP(trim(cell_phone_date), 'MM/dd/yyyy'))) as cell_phone_date,
  to_date(from_unixtime(UNIX_TIMESTAMP(trim(optin_date_stamp), 'MM/dd/yyyy'))) as optin_date_stamp
FROM datetest;

----------------------------------------------------------------------------------------------------------
/*
3/10/2015  3:04:05 AM
4/11/2015  4:05:06 AM
5/13/2015  5:06:07 AM
5/31/2016  5:18:19 PM
4/29/2016  4:17:18 PM
10/20/2015  10:11:12 AM
*/

select timestamp(from_unixtime(UNIX_TIMESTAMP(trim(a), 'MM/dd/yyyy hh:mm:ss aa'))) FROM datetest2;  -- 'aa' parses the AM/PM marker

---------------------------------------------------------------------------------------------------------
2017-01-03T23:55:33.000Z

select timestamp(from_unixtime(unix_timestamp(
         regexp_replace(concat(substr(standardtimestamp,1,10), substr(standardtimestamp,12,2),
                               substr(standardtimestamp,15,2), substr(standardtimestamp,18,2)), '-', ''),
         'yyyyMMddHHmmss'))) as standardtimestamp
from kdf_sample;

create table datetest(date1 string,date2 string,date3 string,date4 string) row format delimited fields terminated by ','
location '/user/dheerendra.a.yadav/testdata1/';

select
  to_date(from_unixtime(UNIX_TIMESTAMP(trim(date1),'MM/dd/yyyy'))) as date1,
  to_date(from_unixtime(UNIX_TIMESTAMP(trim(date2),'MM/dd/yyyy'))) as date2,
  timestamp(from_unixtime(UNIX_TIMESTAMP(trim(date3),'MM/dd/yyyy hh:mm:ss aa'))) as date3,
  timestamp(from_unixtime(unix_timestamp(
    regexp_replace(concat(substr(date4,1,10), substr(date4,12,2), substr(date4,15,2), substr(date4,18,2)), '-', ''),
    'yyyyMMddHHmmss'))) as date4
from datetest;

select standardtimestamp,
       timestamp(from_unixtime(unix_timestamp(
         regexp_replace(concat(substr(standardtimestamp,1,10), substr(standardtimestamp,12,2),
                               substr(standardtimestamp,15,2), substr(standardtimestamp,18,2)), '-', ''),
         'yyyyMMddHHmmss'))) as parsed_timestamp
from test limit 2;



Compare date columns against a date literal (both sides in yyyy-MM-dd date format):


select
sum(case when price_change_date  > from_unixtime(unix_timestamp('2016-12-31' ,'yyyy-MM-dd'), 'yyyy-MM-dd') then 1 else 0 end)  as price_change_date,
sum(case when listing_date  > from_unixtime(unix_timestamp('2016-12-31' ,'yyyy-MM-dd'), 'yyyy-MM-dd') then 1 else 0 end)  as listing_date,
sum(case when reduced_date  > from_unixtime(unix_timestamp('2016-12-31' ,'yyyy-MM-dd'), 'yyyy-MM-dd') then 1 else 0 end)  as reduced_date,
sum(case when distressed_date  > from_unixtime(unix_timestamp('2016-12-31' ,'yyyy-MM-dd'), 'yyyy-MM-dd') then 1 else 0 end)  as distressed_date
from fmdb.Interlinked_FM_DS_UPDT_V3;
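
When the column is a DATE (or a string already in yyyy-MM-dd form), Hive can compare it against the literal directly, so the round trip through unix_timestamp() is only needed for non-standard formats; for example:

select sum(case when listing_date > '2016-12-31' then 1 else 0 end) as listing_date
from fmdb.Interlinked_FM_DS_UPDT_V3;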


When the date is stored as a timestamp and you want to compare it with a date string in a different format, reformat it like below.


select from_unixtime(UNIX_TIMESTAMP(ins_gmt_ts,'yyyy-MM-dd'),'yyyyMMdd') as date1 from dbname_table_name_base limit 1;
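
For example, to use the converted value in a filter (table and column names as in the query above; the cutoff date is hypothetical):

select count(*)
from dbname_table_name_base
where from_unixtime(UNIX_TIMESTAMP(ins_gmt_ts,'yyyy-MM-dd'),'yyyyMMdd') >= '20170101';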

========================================================

When you have an integer or decimal value and you need to change it into date format:

(1) Change the integer or decimal value to a string using the cast function (casting to bigint first drops a trailing .0).
(2) Convert it into a Unix timestamp with unix_timestamp() and the matching pattern, e.g. 'yyyyMMdd'.
(3) Convert that back with from_unixtime().
(4) Format it with date_format() in the required format, e.g. 'yyyy-MM-dd'.

Example:

 Date in decimal - birth_date = 19560101.0
 
Solution:

SELECT date_format(from_unixtime(unix_timestamp(cast(cast(birth_date as bigint) as string),'yyyyMMdd')),'yyyy-MM-dd') as birth_dt from dummy;

Result:
birth_dt = 1956-01-01


Wednesday, 1 February 2017

JSON file parsing

{"shortIp":"38.126.127","os":"Windows","refDomain":"www.cnn.com","standardTimestamp":"2016-12-15T19:58:45.000Z","userAgent":"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0","deviceType":"Personal computer","estid":"CsY3SlcrcMZQT1Uvez0HAg==","browserFamily":"Firefox","mappedEvent":"pview","url":"http://money.cnn.com/2016/12/15/technology/apple-class-action-meal-breaks/index.html?iid=ob_homepage_tech_pool","iabCategories":[{"levelOne":"personal_finance","levelTwo":"financial_news","score":"medium"}],"ip":"38.126.127.126"}

drop table sdf;
create table sdf(
shortIp string
,os string
,refDomain string
,standardTimestamp string
,userAgent string
,deviceType string
,estid string
,browserFamily string
,mappedEvent string
,url string
,iabCategories array<struct<levelOne:string,levelTwo:string,score:string>>
,ip string)ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe';
-------------------------------------------------------------------------------

drop table ext_sdf;
create external table ext_sdf(
shortIp string
,os string
,refDomain string
,standardTimestamp string
,userAgent string
,deviceType string
,estid string
,browserFamily string
,mappedEvent string
,url string
,iabCategories array<struct<levelOne:string,levelTwo:string,score:string>>
,ip string)ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION 'hdfs://127.0.0.1:9000/sdf';



select shortip
,os
,refDomain
,standardTimestamp
,userAgent
,deviceType
,estid
,browserFamily
,mappedEvent
,url
,iabcategories.levelone[0]
,iabcategories.levelTwo[0]
,iabcategories.score[0]
,ip
from ext_sdf;
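
iabcategories.levelone[0] reads only the first array element; when a record carries several iabCategories entries, a LATERAL VIEW explodes them into one row each (a sketch against the same table):

select estid, cat.levelone, cat.leveltwo, cat.score
from ext_sdf
lateral view explode(iabcategories) t as cat;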