字段
備註詳細描述
video id
11位字串
uploader
age
category
length
views
觀看次數
rate
滿分5分
ratings
流量
comments
related ids
2.使用者表
表6-14 使用者表
字段備註
字段型別
uploader
上傳者使用者名稱
string
videos
int
friends
朋友數量
int
public string datarinse
(string str)
//將使用者中的空格替換掉
split[3]
= split[3]
.replaceall
(" ",""
);stringbuilder stringbuilder =
newstringbuilder()
;for
(int i =
0; i < split.length; i++
)else
}else
else}}
return stringbuilder.
tostring()
;}public
static
void
main
(string[
] args)
public
class
extends
}
public
class
etldriver
implements
tool
public
void
setconf
(configuration conf)
public configuration getconf()
public
static
void
main
(string[
] args)
throws exception
}
-- Raw video table, loaded from tab-delimited text files.
-- The two array columns hold several values inside one field, separated by "&".
-- Hive requires an element type on every array column; a bare `array` does not parse.
create table gulivideo_ori(
    videoid string,             -- video id (11-character string per the schema notes)
    uploader string,
    age int,
    category array<string>,     -- one video may belong to several categories
    length int,
    views int,                  -- view count
    rate float,                 -- rating, out of 5 per the schema notes
    ratings int,
    comments int,
    relatedid array<string>)    -- ids of related videos
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
-- Raw uploader table, loaded from tab-delimited text files.
create table gulivideo_user_ori (
    uploader string,    -- uploader's user name
    videos   int,
    friends  int        -- number of friends
)
row format delimited
    fields terminated by "\t"
stored as textfile;
-- The ten rows of gulivideo_ori with the highest view count.
select
    uploader,
    views
from gulivideo_ori
order by views desc
limit 10;
-- Top ten video categories by number of videos ("heat").
select
    t3.cate,
    t3.cou_cate
from (
    -- 2. count the videos in each category
    select
        t2.cate as cate,
        count(*) as cou_cate
    from (
        -- 1. explode the category array: one output row per (video, category)
        select t1.ca as cate
        from gulivideo_ori
        lateral view explode(category) t1 as ca
    ) t2
    group by t2.cate
) t3
-- 3. take the top ten; descending so the largest counts come first
order by t3.cou_cate desc
limit 10;
-- Distinct categories found among the 20 most-viewed (video, category) rows.
-- 3. de-duplicate the categories
select distinct t3.cate
from (
    -- 2. keep the 20 exploded rows with the highest view count
    select
        t2.cate,
        t2.views,
        t2.videoid
    from (
        -- 1. explode the category array: one output row per (video, category)
        select
            t1.ca as cate,
            videoid,
            views
        from gulivideo_ori
        lateral view explode(category) t1 as ca
    ) t2
    order by t2.views desc
    limit 20
) t3;
-- Rank the categories of the videos related to the 50 most-viewed videos.
select
    t5.category,
    t5.hot
from (
    -- 4. explode the category array of the joined rows, group by the
    --    individual category and count. Grouping must use the exploded
    --    value, not the array column, to get one count per category.
    select
        category_name as category,
        count(*) as hot
    from (
        -- 3. join the related ids back to the original table to fetch
        --    each related video's categories
        select
            t2.relatedids_name,
            t3.category
        from (
            -- 2. explode the related-id arrays and de-duplicate the ids
            select distinct relatedids_name
            from (
                -- 1. take the 50 videos with the highest view count
                select relatedid
                from gulivideo_ori
                order by views desc
                limit 50
            ) t1
            lateral view explode(t1.relatedid) relatedids_t as relatedids_name
        ) t2
        join gulivideo_ori t3
            on t2.relatedids_name = t3.videoid
    ) t4
    lateral view explode(t4.category) category_t as category_name
    group by category_name
) t5
-- 5. rank the categories, hottest first
order by t5.hot desc;
-- Top 10 rows by view count within each category.
-- NOTE(review): gulivideo_category is not defined in the visible part of this
-- file; presumably it holds one row per (video, category) — confirm.
select
    t1.categoryid,
    t1.views
from (
    -- 1. number the rows inside each category, highest view count first
    select
        categoryid,
        views,
        row_number() over (partition by categoryid order by views desc) as rn
    from gulivideo_category
) t1
-- 2. keep the first ten rows of every category
where t1.rn <= 10;
-- Top 10 rows by ratings within each category.
-- NOTE(review): gulivideo_category is not defined in the visible part of this
-- file; presumably it holds one row per (video, category) — confirm.
select
    t1.categoryid,
    t1.ratings
from (
    -- number the rows inside each category, highest ratings first
    select
        categoryid,
        ratings,
        row_number() over (partition by categoryid order by ratings desc) as rn
    from gulivideo_category
) t1
-- keep the first ten rows of every category
where t1.rn <= 10;
-- Among the videos of the 20 uploaders with the most uploads,
-- return the 20 with the highest view count.
select
    t2.uploader,
    t2.views
from (
    -- the 20 uploaders with the largest number of uploaded videos
    select uploader
    from gulivideo_user_ori
    order by videos desc
    limit 20
) t1
join gulivideo_ori t2
    -- join condition belongs in ON, not WHERE
    on t1.uploader = t2.uploader
order by t2.views desc
limit 20;
-- Top 10 rows by view count within each category.
-- NOTE(review): gulivideo_category is not defined in the visible part of this
-- file; presumably it holds one row per (video, category) — confirm.
select
    t1.categoryid,
    t1.views
from (
    -- number the rows inside each category, highest view count first
    select
        categoryid,
        views,
        row_number() over (partition by categoryid order by views desc) as rn
    from gulivideo_category
) t1
-- keep the first ten rows of every category
where t1.rn <= 10;
Hive專案實戰三
這裡總共需要建立4張表,明明只有兩個資料檔案,為什麼要建立4張表呢?因為這裡建立的表要使用orc的壓縮方式,而不使用預設的textfile的方式,orc的壓縮方式要想向表中匯入資料需要使用子查詢的方式匯入,即把從另一張表中查詢到的資料插入orc壓縮格式的表中,所以這裡需要四張表,兩張textfil...
hive 專案實戰 2
建表 建立表這裡總共需要建立4張表,明明只有兩個資料檔案,為什麼要建立4張表呢?因為這裡建立的表要使用orc的壓縮方式,而不使用預設的textfile的方式,orc的壓縮方式要想向表中匯入資料需要使用子查詢的方式匯入,即把從另一張表中查詢到的資料插入orc壓縮格式的表中,所以這裡需要四張表,兩張t...
Hive專案實戰一
1.需求描述 2.資料來源結構說明 資料來源1 user.txt 資料樣例 資料樣例中的三個字段結構 上傳者使用者名稱 string int朋友數量 int資料來源2 video.txt 資料樣例 fqshwyqgqsw lonelygirl15736 people blogs133 151763 ...