Hive Hive架構及常規操作

建立內部表

-- Create a managed (internal) table with a single string column.
create table article (
    sentence string
)
row format delimited
fields terminated by '\n';

-- Load data from the local filesystem. This is equivalent to copying the
-- file into the table's warehouse directory, e.g.
--   hadoop fs -put <path> /hive/warehouse/badou.db/
load data local inpath '/home/badou/mr/code/the_man_of_property.txt'
into table article;

-- Query the data.
select *
from article
limit 3;

建立外部表

-- External table.
-- Prepare the HDFS directory first (shell commands, not HiveQL):
--   hadoop fs -mkdir /data/ext
--   hadoop fs -cp /data/the_man_of_property.txt /data/ext
create external table article2 (
    sentence string
)
row format delimited
fields terminated by '\n'
stored as textfile
location '/data/ext';

在hive/warehouse/badou.db下沒有外部表檔案，但是可以在表中查詢到資料

-- Word count: split each sentence on single spaces, explode the resulting
-- array into one row per word, then count the occurrences of every word.
-- Returns at most 100 (word, cnt) rows.
select
    word,
    count(1) as cnt
from (
    select explode(split(sentence, ' ')) as word
    from article
) t  -- subquery alias; must be separated from the GROUP BY keyword
group by word
limit 100;

【注】執行hive前需要先將hadoop及mysql啟動

-- Word count ordered by frequency: split each sentence on single spaces,
-- explode into one row per word, count per word, and return the 100 words
-- with the highest counts.
select
    word,
    count(1) as cnt
from (
    select explode(split(sentence, ' ')) as word
    from article
) t  -- subquery alias; must be separated from the GROUP BY keyword
group by word
order by cnt desc
limit 100;

【注】order by 只會產生一個reduce任務

內部表：資料需要儲存在hive目錄對應的資料夾下，即使hdfs上在其他路徑下已經存在。外部表：可以直接呼叫hdfs上的資料。

內部表外部表

資料需要儲存在hive目錄對應的資料夾下，即使hdfs上在其他路徑下已經存在

可以直接呼叫hdfs上的資料

create table name

create external table name location 'hdfs_path'（hdfs_path 必須是資料夾路徑）

-- Partitioned table: one string column of data, partitioned by the
-- string key dt.
create table art_dt (
    sentence string
)
partitioned by (dt string)
row format delimited
fields terminated by '\n';

-- Populate the new (partitioned) table from an existing Hive table,
-- one statement per target partition.
insert overwrite table art_dt partition (dt = '20180924')
    select *
    from article
    limit 100;

insert overwrite table art_dt partition (dt = '20180925')
    select *
    from article
    limit 100;

-- Listing of the table directory on HDFS:
-- [root@master ~]# hadoop fs -ls /user/hive/warehouse/badou.db/art_dt
-- found 1 items
-- 2018-09-24 08:45 /user/hive/warehouse/badou.db/art_dt/dt=20180924

-- List the partitions of the partitioned table.
show partitions art_dt;

-- Read rows from the partitions whose dt falls in the given (inclusive) range.
select *
from art_dt
where dt between '20180924' and '20180925'
limit 10;

每天都會產生使用者瀏覽，點選，收藏，購買的記錄。按照天的方式去儲存資料，按天做partition

資料庫中的資料有使用者的屬性：年齡、性別、blog 等。每天都有新增的使用者和修改的資訊，如果 dt=20180924 和 dt=20180925 各存一份全量資料，會造成大量資訊冗餘。這個時候應該用 overwrite。

overwrite++ 7：每天做 overwrite，例如 dt=20180922，這天中的資料包含這天之前的所有使用者資訊。

當天之前所有的全量資料。存7個分割槽，冗餘七份，防止丟失資料。

-- Create table udata (tab-separated fields).
create table udata (
    user_id     string,
    item_id     string,
    rating      string,
    `timestamp` string
)
row format delimited
fields terminated by '\t';

-- Load the data from the local filesystem.
load data local inpath '/home/badou/data/u.data'
into table udata;

-- Show column names (print a header row) in CLI output.
set hive.cli.print.header=true;

select *
from udata
limit 50;

-- Create a bucketed table: rows are clustered by user_id into 4 buckets.
create table bucket_users (
    user_id     string,
    item_id     string,
    rating      string,
    `timestamp` string
)
clustered by (user_id) into 4 buckets;

-- Ask Hive to enforce the table's bucketing definition when inserting.
set hive.enforce.bucketing = true;

-- Insert the rows of the previously created udata table into the 4 buckets
-- of bucket_users; per the original note, this runs with 4 reducers.
insert overwrite table bucket_users
select
    cast(user_id as int) as user_id,
    item_id,
    rating,
    `timestamp`
from udata;

Hive Hive架構及常規操作

oracle常規操作

ROS常規操作

Ubuntu常規操作

Hive Hive架構及常規操作

oracle常規操作

ROS常規操作

Ubuntu常規操作

相關推薦