HIVE專案實戰

字段

備註詳細描述

video id

11位字串

uploader

age

category

length

views

觀看次數

rate

滿分5分

ratings

流量

comments

related ids

2．使用者表

表6-14 使用者表

字段

備註

字段型別

uploader

上傳者使用者名稱

string

videos

int

friends

朋友數量

int

public string datarinse
(string str)
//將使用者中的空格替換掉
split[3]
= split[3]
.replaceall
(" ",""
);stringbuilder stringbuilder =
newstringbuilder()
;for
(int i =
0; i < split.length; i++
)else
}else
else}}
return stringbuilder.
tostring()
;}public
static
void
main
(string[
] args)

public
class
extends
}

public
class
etldriver
implements
tool
public
void
setconf
(configuration conf)
public configuration getconf()
public
static
void
main
(string[
] args)
throws exception 
}

-- Raw video table stored as tab-delimited text.
-- The array columns (category, relatedid) use "&" as their item separator.
-- Hive requires an element type on array columns; string is used because
-- relatedid values are compared against the string column videoid elsewhere
-- in this file, and categories are exploded into plain names.
create table gulivideo_ori(
    videoid string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedid array<string>
)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;

-- Raw uploader table stored as tab-delimited text:
-- one row per uploader with a videos count and a friends count.
create table gulivideo_user_ori(
    uploader string,
    videos int,
    friends int
)
row format delimited
fields terminated by "\t"
stored as textfile;

-- Uploader and view count of the ten rows with the highest view count.
select
    uploader,
    views
from gulivideo_ori
order by views desc
limit 10;

-- Top 10 categories by popularity, where popularity is the number of
-- video rows that carry the category.
select
    t3.cate,
    t3.cou_cate
from (
    -- 2. count the rows per category
    select
        t2.cate as cate,
        count(*) as cou_cate
    from (
        -- 1. explode the category array into one row per (video, category)
        select
            t1.ca as cate
        from gulivideo_ori
        lateral view explode(category) t1 as ca
    ) t2
    group by t2.cate
) t3
-- 3. keep the ten largest counts; descending order is required for a
--    "top 10", an ascending sort would return the least popular categories
order by t3.cou_cate desc
limit 10;

-- Distinct categories found among the 20 highest-view exploded rows.
-- NOTE(review): the limit is applied after the category array is exploded,
-- so a video with several categories occupies several of the 20 slots.
-- If the intent is "categories of the 20 most-viewed videos", the limit
-- should be applied before exploding -- confirm against the requirement.
-- 3. de-duplicate the categories
select distinct
    t3.cate
from (
    -- 2. keep the 20 exploded rows with the most views
    select
        t2.cate,
        t2.views,
        t2.videoid
    from (
        -- 1. explode the category array into one row per (video, category)
        select
            t1.ca as cate,
            videoid,
            views
        from gulivideo_ori
        lateral view explode(category) t1 as ca
    ) t2
    order by t2.views desc
    limit 20
) t3;

-- Rank categories by how many videos related to the 50 most-viewed videos
-- belong to each category.
-- The result columns are category_name (one exploded category) and hot (its count).
-- 5. order the categories by their count
select
    t5.category_name,
    t5.hot
from (
    -- 4. explode the category array of every related video, then group by
    --    the exploded value (not by the array column) and count
    select
        category_name,
        count(*) as hot
    from (
        -- 3. join the related ids back to the source table to fetch the
        --    categories of each related video
        select
            t3.category
        from (
            -- 2. explode the related-id arrays and de-duplicate the ids
            select distinct
                relatedids_name
            from (
                -- 1. the 50 most-viewed videos
                select
                    relatedid,
                    views
                from gulivideo_ori
                order by views desc
                limit 50
            ) t1
            lateral view explode(t1.relatedid) relatedids_t as relatedids_name
        ) t2
        join gulivideo_ori t3
            on t2.relatedids_name = t3.videoid
    ) t4
    lateral view explode(t4.category) category_t as category_name
    group by category_name
) t5
order by t5.hot desc;

-- Ten highest view counts within each category.
-- 2. keep the first ten rows of every category
select
    t1.categoryid,
    t1.views
from (
    -- 1. number the rows of each category by views, highest first
    select
        categoryid,
        views,
        -- aliased rk rather than rank to avoid shadowing the rank() function name
        row_number() over (partition by categoryid order by views desc) as rk
    from gulivideo_category
) t1
where t1.rk <= 10;

-- Ten highest ratings values within each category.
select
    t1.categoryid,
    t1.ratings
from (
    -- number the rows of each category by ratings, highest first
    select
        categoryid,
        ratings,
        -- aliased rk rather than rank to avoid shadowing the rank() function name
        row_number() over (partition by categoryid order by ratings desc) as rk
    from gulivideo_category
) t1
where t1.rk <= 10;

-- Among the videos of the 20 uploaders with the most uploaded videos,
-- return the 20 rows with the highest view count.
select
    t2.uploader,
    t2.views
from (
    -- the 20 uploaders with the largest videos count
    select
        uploader,
        videos
    from gulivideo_user_ori
    order by videos desc
    limit 20
) t1
join gulivideo_ori t2
    on t1.uploader = t2.uploader
order by t2.views desc
limit 20;

-- Ten highest view counts within each category.
select
    t1.categoryid,
    t1.views
from (
    -- number the rows of each category by views, highest first
    select
        categoryid,
        views,
        -- aliased rk rather than rank to avoid shadowing the rank() function name
        row_number() over (partition by categoryid order by views desc) as rk
    from gulivideo_category
) t1
where t1.rk <= 10;

Hive專案實戰三

這裡總共需要建立4張表，明明只有兩個資料檔案，為什麼要建立4張表呢？因為這裡建立的表要使用orc的壓縮方式，而不使用預設的textfile的方式，orc的壓縮方式要想向表中匯入資料需要使用子查詢的方式匯入，即把從另一張表中查詢到的資料插入orc壓縮格式的表匯中，所以這裡需要四張表，兩張textfil...

hive 專案實戰 2

建表建立表這裡總共需要建立4張表，明明只有兩個資料檔案，為什麼要建立4張表呢？因為這裡建立的表要使用orc的壓縮方式，而不使用預設的textfile的方式，orc的壓縮方式要想向表中匯入資料需要使用子查詢的方式匯入，即把從另一張表中查詢到的資料插入orc壓縮格式的表匯中，所以這裡需要四張表，兩張t...

Hive專案實戰一

1.需求描述 2.資料來源結構說明資料來源1 user.txt 資料樣例資料樣例中的三個字段結構上傳者使用者名稱 string int朋友數量 int資料來源2 video.txt 資料樣例 fqshwyqgqsw lonelygirl15736 people blogs133 151763 ...

HIVE專案實戰

Hive專案實戰三

hive 專案實戰 2

Hive專案實戰一

相關推薦