hive (test)> desc formatted dept_partition;
OK
col_name                data_type               comment
# col_name              data_type               comment

deptno                  int
dname                   string
loc                     string

# Partition Information
# col_name              data_type               comment

day                     string                  -- [the partition column]

# Detailed Table Information
Database:               test
OwnerType:              USER
Owner:                  bigdata
CreateTime:             Sat Dec 19 07:42:11 CST 2020
LastAccessTime:         UNKNOWN
Protect Mode:           None
Retention:              0
Location:               hdfs://hadoop001:9000/user/hive/warehouse/test.db/dept_partition
Table Type:             MANAGED_TABLE
...
Load data
hive (test)> LOAD DATA LOCAL INPATH '/home/bigdata/data/dept20190201.txt' OVERWRITE INTO TABLE dept_partition
           > partition(day=20190201);
Loading data to table test.dept_partition partition (day=20190201)
Partition test.dept_partition{day=20190201} stats: [numFiles=1, numRows=0, totalSize=79, rawDataSize=0]
OK
Time taken: 0.327 seconds
hive (test)> select * from dept_partition where day = '20190201';   -- [query with the partition predicate]
OK
dept_partition.deptno    dept_partition.dname    dept_partition.loc    dept_partition.day
10    ACCOUNTING    NEW YORK    20190201
20    RESEARCH      DALLAS      20190201
30    SALES         CHICAGO     20190201
40    OPERATIONS    BOSTON      20190201
Time taken: 0.224 seconds, Fetched: 4 row(s)
Load data into the other partitions as well, then look at the table directory on HDFS: dept_partition now contains one subdirectory per partition.
Check the partition metadata in the MySQL metastore (the PARTITIONS table):
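A minimal sketch of that check, assuming the metastore database in MySQL is named hive (the name depends on your installation); PARTITIONS is the metastore table that stores one row per Hive partition:

mysql> use hive;
mysql> select PART_ID, CREATE_TIME, PART_NAME, TBL_ID from PARTITIONS;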
[Note] For partition data uploaded to HDFS by hand, the metastore has no metadata, so a direct query returns no rows.
# Create a directory on HDFS whose path matches the table's partition layout
[bigdata@hadoop001 data]$ hadoop fs -mkdir -p /user/hive/warehouse/test.db/dept_partition/day=20190203
# Upload the data file
[bigdata@hadoop001 data]$ hadoop fs -put dept20190203.txt /user/hive/warehouse/test.db/dept_partition/day=20190203/
# Query from Hive
hive (test)> select * from dept_partition where day = '20190203';
OK
dept_partition.deptno    dept_partition.dname    dept_partition.loc    dept_partition.day
Time taken: 0.055 seconds
Repairing the metadata
[Not used in production: it rescans the entire partitioned table, so performance is poor]
MSCK REPAIR TABLE tablename;
hive (test)> MSCK REPAIR TABLE dept_partition;
OK
Partitions not in metastore:    dept_partition:day=20190203
Repair: Added partition to metastore dept_partition:day=20190203
Time taken: 0.096 seconds, Fetched: 2 row(s)
hive (test)> select * from dept_partition where day = '20190203';
OK
dept_partition.deptno    dept_partition.dname    dept_partition.loc    dept_partition.day
10    ACCOUNTING    NEW YORK    20190203
20    RESEARCH      DALLAS      20190203
30    SALES         CHICAGO     20190203
40    OPERATIONS    BOSTON      20190203
Time taken: 0.053 seconds, Fetched: 4 row(s)
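In production, the usual fix is to register just the missing partition instead of letting MSCK rescan the whole table. A minimal sketch (the LOCATION clause is only needed when the directory is not under the table's default partition path):

hive (test)> ALTER TABLE dept_partition ADD IF NOT EXISTS PARTITION (day='20190203')
           > LOCATION '/user/hive/warehouse/test.db/dept_partition/day=20190203';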
Adding a partition
Use ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION (col_name=value);
hive (test)> ALTER TABLE dept_partition ADD IF NOT EXISTS PARTITION (day='20190204');
OK
Time taken: 0.041 seconds
hive (test)> select * from dept_partition where day = '20190204';
OK
dept_partition.deptno    dept_partition.dname    dept_partition.loc    dept_partition.day
10    ACCOUNTING    NEW YORK    20190204
20    RESEARCH      DALLAS      20190204
30    SALES         CHICAGO     20190204
40    OPERATIONS    BOSTON      20190204
Time taken: 0.061 seconds, Fetched: 4 row(s)
Dropping a partition
ALTER TABLE dept_partition DROP PARTITION(day='20300503'), PARTITION(day='20300502');
hive (test)> alter table dept_partition drop partition(day='20190201');
Dropped the partition day=20190201
OK
Time taken: 0.057 seconds
hive (test)> show partitions dept_partition;
OK
partition
day=20190202
day=20190203
day=20190204
day=20190205
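Because dept_partition is a MANAGED_TABLE (see the desc formatted output above), dropping the partition also deletes its directory on HDFS; a quick check, sketched under that assumption:

[bigdata@hadoop001 data]$ hadoop fs -ls /user/hive/warehouse/test.db/dept_partition/
# the day=20190201 directory should no longer appear in the listing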
Viewing partition information (this listing was captured before the drop above, so day=20190201 still appears)
hive (test)> show partitions dept_partition;
OK
partition
day=20190201
day=20190202
day=20190203
day=20190204
day=20190205
Time taken: 0.041 seconds, Fetched: 5 row(s)
Inserting data into a partition
insert into table dept_partition partition(day='20190202') select * from dept;
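As an aside, when many partitions have to be filled at once, Hive's dynamic partition insert can take the partition value from the select list instead of hard-coding it. A minimal sketch, assuming a staging table dept_staging (hypothetical name) whose last column is day:

hive (test)> set hive.exec.dynamic.partition=true;
hive (test)> set hive.exec.dynamic.partition.mode=nonstrict;
hive (test)> insert into table dept_partition partition(day)
           > select deptno, dname, loc, day from dept_staging;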
Multi-level partitions
Create the table
hive (test)> create table dept_partition_d_h(
           >     deptno int,
           >     dname string,
           >     loc string
           > )
           > PARTITIONED BY (day string, hour string)
           > ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
Load data
hive (test)> LOAD DATA LOCAL INPATH '/home/bigdata/data/dept20190203.txt' OVERWRITE INTO TABLE dept_partition_d_h PARTITION (day='20190203', hour='21');
Loading data to table test.dept_partition_d_h partition (day=20190203, hour=21)
Partition test.dept_partition_d_h{day=20190203, hour=21} stats: [numFiles=1, numRows=0, totalSize=79, rawDataSize=0]
OK
Time taken: 0.585 seconds
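On HDFS, every additional partition column adds one directory level, so the load above should create a nested day=/hour= layout; a sketch of the expected listing:

[bigdata@hadoop001 data]$ hadoop fs -ls /user/hive/warehouse/test.db/dept_partition_d_h/day=20190203/
# expected: .../dept_partition_d_h/day=20190203/hour=21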
Query
hive (test)> select * from dept_partition_d_h where day='20190203' and hour='21';
OK
dept_partition_d_h.deptno    dept_partition_d_h.dname    dept_partition_d_h.loc    dept_partition_d_h.day    dept_partition_d_h.hour
10    ACCOUNTING    NEW YORK    20190203    21
20    RESEARCH      DALLAS      20190203    21
30    SALES         CHICAGO     20190203    21
40    OPERATIONS    BOSTON      20190203    21
Time taken: 0.055 seconds, Fetched: 4 row(s)
The following query computes, for each domain and month, that month's traffic together with the cumulative traffic up to and including that month, via a self-join on the monthly aggregates:

select
    A.domain,
    A.month,
    max(A.traffic),
    sum(B.traffic)
from
    (select domain, substr(regexp_replace(time, '/', '-'), 1, 7) as month, sum(traffic) as traffic
     from ip_info
     group by domain, substr(regexp_replace(time, '/', '-'), 1, 7)) A
inner join
    (select domain, substr(regexp_replace(time, '/', '-'), 1, 7) as month, sum(traffic) as traffic
     from ip_info
     group by domain, substr(regexp_replace(time, '/', '-'), 1, 7)) B
on A.domain = B.domain
where B.month <= A.month
group by A.domain, A.month
order by A.domain, A.month;
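On Hive 0.11+ the same cumulative sum can be written more directly with a window function, avoiding the self-join; a sketch under that version assumption:

select
    domain,
    month,
    traffic,                                                               -- this month's traffic (max(A.traffic) above)
    sum(traffic) over (partition by domain order by month) as cum_traffic -- running total (sum(B.traffic) above)
from
(
    select
        domain,
        substr(regexp_replace(time, '/', '-'), 1, 7) as month,
        sum(traffic) as traffic
    from ip_info
    group by domain, substr(regexp_replace(time, '/', '-'), 1, 7)
) t;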
Next, a row-to-column example. Prepare the sample data:

hive (test)> create table emp_info(
           >     id string,
           >     name string,
           >     dept string,
           >     sex string
           > ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.424 seconds
hive (test)> LOAD DATA LOCAL INPATH '/home/bigdata/data/emp_info.txt' OVERWRITE INTO TABLE emp_info;
Loading data to table test.emp_info
Table test.emp_info stats: [numFiles=1, totalSize=92]
OK
Time taken: 0.965 seconds
hive (test)> select * from emp_info;
OK
emp_info.id    emp_info.name    emp_info.dept    emp_info.sex
1    JIM          RD    1
2    PETTER       RD    1
3    XIAOHONG     RD    2
4    XIAOZHANG    QA    1
5    XIAOLI       QA    2
6    XIAOFANG     QA    2
Time taken: 0.218 seconds, Fetched: 6 row(s)
Requirement: group together the people who share the same department and sex.
Concatenate the department and sex fields
hive (test)> select name, concat_ws(',', dept, sex) dept_sex from emp_info;
OK
name         dept_sex
JIM          RD,1
PETTER       RD,1
XIAOHONG     RD,2
XIAOZHANG    QA,1
XIAOLI       QA,2
XIAOFANG     QA,2
Time taken: 0.09 seconds, Fetched: 6 row(s)
Group by the concatenated field and join the names together
hive (test)> select dept_sex, concat_ws('|', collect_set(t.name)) from
           > (select name, concat_ws(',', dept, sex) dept_sex from emp_info) t
           > group by dept_sex;
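The output is not shown above, but from the six sample rows the result should be the following (collect_set does not guarantee element order, so the names may come back in a different order):

QA,1    XIAOZHANG
QA,2    XIAOLI|XIAOFANG
RD,1    JIM|PETTER
RD,2    XIAOHONG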
Column-to-row: explode(split(column_name, ','))
Prepare the data
hive (test)> create table column2row(name string, courses string)
           > row format delimited fields terminated by '\t';
OK
Time taken: 0.038 seconds
hive (test)> LOAD DATA LOCAL INPATH '/home/bigdata/data/column2row.txt' OVERWRITE INTO TABLE column2row;
Loading data to table test.column2row
Table test.column2row stats: [numFiles=1, totalSize=57]
OK
Time taken: 0.167 seconds
hive (test)> select * from column2row;
OK
column2row.name    column2row.courses
JIM       MapReduce,Hive,Spark,Flink
PETTER    Hadoop,HBase,Kafka
Show each name with each of its courses on its own row. Because explode is a UDTF, it cannot simply be selected alongside name; LATERAL VIEW joins each exploded row back to its source row:
hive (test)> select
           >     name, course
           > from
           >     column2row
           > lateral view explode(split(courses, ',')) course_tmp as course;
OK
name      course
JIM       MapReduce
JIM       Hive
JIM       Spark
JIM       Flink
PETTER    Hadoop
PETTER    HBase
PETTER    Kafka
Time taken: 0.07 seconds, Fetched: 7 row(s)
Word count in Hive. Prepare the data:

hive (test)> create table wordcount(sentence string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
OK
Time taken: 0.081 seconds
hive (test)> LOAD DATA LOCAL INPATH '/home/bigdata/data/wordcount.txt' OVERWRITE INTO TABLE wordcount;
Loading data to table test.wordcount
Table test.wordcount stats: [numFiles=1, totalSize=41]
OK
Time taken: 0.802 seconds
hive (test)> select * from wordcount;
OK
wordcount.sentence
hello,hello,hello
world,world
welcome
Split the field, then group and count
hive (test)> select
           >     word, count(1) cnt
           > from
           > (
           >     select
           >         explode(split(sentence, ',')) as word
           >     from
           >         wordcount
           > ) t
           > group by word
           > order by cnt desc;
word       cnt
hello      3
world      2
welcome    1