hive(2)
Load
语法:
LOAD DATA [LOCAL] INPATH ‘filepath’ [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 …)]
说明:
LOAD DATA:加载数据
LOCAL:可有可无
有:从本地[Hive客户端]加载数据到Hive表
无:从文件系统加载数据到Hive表
INPATH:加载数据的路径
OVERWRITE:可有可无
有:覆盖已有的数据 overwrite
无:追加 append
INTO TABLE:加载数据到哪个表中
-
创建表
1
2
3
4
5
6
7
8
9
10
11hive (default)> create table emp(
> empno int,
> ename string,
> job string,
> mgr int,
> hiredate string,
> sal double,
> comm double,
> deptno int
> ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
> location '/data/hive/';
-
从本地上传文件到linux
1
[bigdata@hadoop001 data]$ rz
-
加载linux的文件到hive表
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23hive (default)> LOAD DATA LOCAL INPATH '/home/bigdata/data/emp.txt' OVERWRITE INTO TABLE emp4;
Loading data to table default.emp4
Table default.emp4 stats: [numFiles=0, totalSize=0]
OK
Time taken: 1.001 seconds
hive (default)> select * from emp4;
OK
emp4.empno emp4.ename emp4.job emp4.mgr emp4.hiredate emp4.sal emp4.comm emp4.deptno
7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 30
7566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.0 1400.0 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.0 NULL 30
7782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.0 NULL 20
7839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.0 0.0 30
7876 ADAMS CLERK 7788 1987-5-23 1100.0 NULL 20
7900 JAMES CLERK 7698 1981-12-3 950.0 NULL 30
7902 FORD ANALYST 7566 1981-12-3 3000.0 NULL 20
7934 MILLER CLERK 7782 1982-1-23 1300.0 NULL 10
8888 HIVE PROGRAM 7839 1988-1-23 10300.0 NULL NULL1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40hive (default)> LOAD DATA INPATH '/data/hive/emp2.txt' INTO TABLE emp4;
Loading data to table default.emp4
Table default.emp4 stats: [numFiles=0, totalSize=0]
OK
Time taken: 0.167 seconds
hive (default)> select * from emp4;
OK
emp4.empno emp4.ename emp4.job emp4.mgr emp4.hiredate emp4.sal emp4.comm emp4.deptno
7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 30
7566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.0 1400.0 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.0 NULL 30
7782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.0 NULL 20
7839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.0 0.0 30
7876 ADAMS CLERK 7788 1987-5-23 1100.0 NULL 20
7900 JAMES CLERK 7698 1981-12-3 950.0 NULL 30
7902 FORD ANALYST 7566 1981-12-3 3000.0 NULL 20
7934 MILLER CLERK 7782 1982-1-23 1300.0 NULL 10
8888 HIVE PROGRAM 7839 1988-1-23 10300.0 NULL NULL
7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 30
7566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.0 1400.0 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.0 NULL 30
7782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.0 NULL 20
7839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.0 0.0 30
7876 ADAMS CLERK 7788 1987-5-23 1100.0 NULL 20
7900 JAMES CLERK 7698 1981-12-3 950.0 NULL 30
7902 FORD ANALYST 7566 1981-12-3 3000.0 NULL 20
7934 MILLER CLERK 7782 1982-1-23 1300.0 NULL 10
8888 HIVE PROGRAM 7839 1988-1-23 10300.0 NULL NULL
Time taken: 0.033 seconds, Fetched: 30 row(s)
hive (default)>
CTAS 【目标表不存在】
1 | create table emp3 as select * from emp; |
INSERT
INSERT OVERWRITE TABLE tablename1 select_statement1 FROM from_statement;
INSERT INTO TABLE tablename1 select_statement1 FROM from_statement;
【0.14版本开始支持】
1 | hive (default)> show tables; |
【insert的表必须存在】
- INSERT INTO TABLE
1 | hive (default)> create table emp2 like emp4; |
- INSERT OVERWRITE TABLE
1 | hive (default)> INSERT OVERWRITE TABLE emp2 select * from emp4; |
-
FROM from_statement
INSERT OVERWRITE TABLE tablename1 select_statement1
【将一个表的结果插入不同的表中】
FROM emp
INSERT OVERWRITE TABLE result1 A业务
INSERT OVERWRITE TABLE result2 B业务
1 | hive (default)> FROM emp4 |
【坑】使用这种语法 字段的类型和数量两个表要匹配
1 | hive (default)> insert overwrite table emp2 select empno,ename from emp4; |
1 | hive (default)> INSERT OVERWRITE TABLE emp2 |
【字段名要按顺序一一匹配】
-
INSERT INTO TABLE tablename VALUES values_row [, values_row …]
准备数据
1
2
3
4
5
6
7
8
9[bigdata@hadoop001 data]$ hdfs dfs -put dept.txt /user/hive/warehouse/test.db/dept
hive (test)> create table dept(deptno int,dname string,loc string)
> ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
OK
[bigdata@hadoop001 data]$ hdfs dfs -ls dept.txt /user/hive/warehouse/test.db
Found 1 items
-rw-r--r-- 1 bigdata supergroup 79 2020-12-18 21:14 /user/hive/warehouse/test.db/dept/dept.txt多次执行
1
2
3
4
5
6
7
8
9
10
11hive (test)> INSERT INTO TABLE dept VALUES (12345, 'QA', 'CD');
hive (test)> INSERT INTO TABLE dept VALUES (95271, 'DEV', 'BJ');
查看dhfs
[bigdata@hadoop001 data]$ hdfs dfs -ls dept.txt /user/hive/warehouse/test.db/dept
20/12/18 21:17:36 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
ls: `dept.txt': No such file or directory
Found 3 items
-rwxr-xr-x 1 bigdata supergroup 13 2020-12-18 21:16 /user/hive/warehouse/test.db/dept/000000_0
-rwxr-xr-x 1 bigdata supergroup 12 2020-12-18 21:17 /user/hive/warehouse/test.db/dept/000000_0_copy_1
-rw-r--r-- 1 bigdata supergroup 79 2020-12-18 21:14 /user/hive/warehouse/test.db/dept/dept.txt产生多个小文件【生产上不用】
-
将查询结果写到(本地或hdfs)文件系统中
语法:
INSERT OVERWRITE [LOCAL] DIRECTORY directory1
ROW FORMAT row_format
SELECT … FROM …
1 | hive (test)> INSERT OVERWRITE LOCAL DIRECTORY '/home/bigdata/app/hive/data/hivetmp' |
1 | hive (test)> INSERT OVERWRITE DIRECTORY '/data/hivetmp' |
不进入hive直接查看
1 | [bigdata@hadoop001 hivetmp]$ hive -e 'select * from test.emp' |
不使用where进行数据抓取
1 | [bigdata@hadoop001 hivetmp]$ hive -e 'select * from test.emp'|grep SMITH |
EXPORT
1 | The EXPORT and IMPORT commands were added in Hive 0.8.0 (see HIVE-1918). |
导出:包含数据和元数据
语法:
EXPORT TABLE tablename TO ‘export_target_path’
1 | hive (test)> EXPORT TABLE test.emp to '/hive_export/emp'; |
IMPORT
会自动创建表
语法: IMPORT [[EXTERNAL] TABLE new_or_original_tablename
FROM ‘source_path’
[LOCATION ‘import_target_path’]
1 | hive (test)> IMPORT TABLE test.emp_import |
TRUNCATE
【Hive 0.11.0】清空表数据,结构还在, 删除的数据在垃圾桶
语法:
TRUNCATE [TABLE] table_name [PARTITION partition_spec];
1 | hive (test)> truncate table dept2; |
- TRUNCATE 不能清空外部表【non-managed table】
DROP
【Hive 0.11.0】删除表数据和元数据信息
语法:
DROP TABLE table_name [PARTITION partition_spec];
SELECT
语法: 与MySQL相似
-
聚合函数:count max min sum avg
(多进一出)
-
普通函数
hive.exec.mode.local.auto 默认是false,可以设置为true,本地模式,用于测试提升速度
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36hive (test)> select
> e.empno, e.ename, nvl(e.deptno,d.deptno), d.dname
> from
> emp e left join dept d
> on e.deptno = d.deptno;
Automatically selecting local only mode for query
Query ID = bigdata_20201218222727_a5b36452-6bcd-47cd-a63e-a89cc9f1512e
Total jobs = 1
Execution completed successfully
MapredLocal task succeeded
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Job running in-process (local Hadoop)
2020-12-18 22:27:59,993 Stage-3 map = 100%, reduce = 0%
Ended Job = job_local228372600_0005
MapReduce Jobs Launched:
Stage-Stage-3: HDFS Read: 9474 HDFS Write: 112198881 SUCCESS
Total MapReduce CPU Time Spent: 0 msec
OK
e.empno e.ename _c2 d.dname
7369 SMITH 20 RESEARCH
7499 ALLEN 30 SALES
7521 WARD 30 SALES
7566 JONES 20 RESEARCH
7654 MARTIN 30 SALES
7698 BLAKE 30 SALES
7782 CLARK 10 ACCOUNTING
7788 SCOTT 20 RESEARCH
7839 KING 10 ACCOUNTING
7844 TURNER 30 SALES
7876 ADAMS 20 RESEARCH
7900 JAMES 30 SALES
7902 FORD 20 RESEARCH
7934 MILLER 10 ACCOUNTING
8888 HIVE NULL NULL
Time taken: 5.08 seconds, Fetched: 15 row(s)(inner) join匹配左右都有的数据
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21hive (test)> select
> e.empno, e.ename, d.dname
> from
> emp e join dept d
> on e.deptno = d.deptno;
e.empno e.ename d.dname
7369 SMITH RESEARCH
7499 ALLEN SALES
7521 WARD SALES
7566 JONES RESEARCH
7654 MARTIN SALES
7698 BLAKE SALES
7782 CLARK ACCOUNTING
7788 SCOTT RESEARCH
7839 KING ACCOUNTING
7844 TURNER SALES
7876 ADAMS RESEARCH
7900 JAMES SALES
7902 FORD RESEARCH
7934 MILLER ACCOUNTINGleft join 【左表为基准,没有匹配上的取null】
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21hive (test)> select
> e.empno, e.ename, d.dname
> from
> emp e left join dept d
> on e.deptno = d.deptno;
e.empno e.ename d.dname
7369 SMITH RESEARCH
7499 ALLEN SALES
7521 WARD SALES
7566 JONES RESEARCH
7654 MARTIN SALES
7698 BLAKE SALES
7782 CLARK ACCOUNTING
7788 SCOTT RESEARCH
7839 KING ACCOUNTING
7844 TURNER SALES
7876 ADAMS RESEARCH
7900 JAMES SALES
7902 FORD RESEARCH
7934 MILLER ACCOUNTING
8888 HIVE NULLright join【右表为基准,没有匹配上的取null】
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21hive (test)> select
> e.empno, e.ename, d.dname
> from
> emp e right join dept d
> on e.deptno = d.deptno;
e.empno e.ename d.dname
7782 CLARK ACCOUNTING
7839 KING ACCOUNTING
7934 MILLER ACCOUNTING
7369 SMITH RESEARCH
7566 JONES RESEARCH
7788 SCOTT RESEARCH
7876 ADAMS RESEARCH
7902 FORD RESEARCH
7499 ALLEN SALES
7521 WARD SALES
7654 MARTIN SALES
7698 BLAKE SALES
7844 TURNER SALES
7900 JAMES SALES
NULL NULL OPERATIONSfull join 左右表数据都有
其他情况
- A不包含A与B的交集
1
2
3
4
5
6
7
8
9hive (test)> select
> e.empno, e.ename, d.deptno
> from
> emp e left join dept d
> on e.deptno = d.deptno
> where d.deptno is null;
e.empno e.ename d.deptno
8888 HIVE NULL-
B不包含A与B的交集
1
2
3
4
5
6
7
8
9hive (test)> select
> e.empno, d.deptno, d.deptno
> from
> emp e right join dept d
> on e.deptno = d.deptno
> where e.deptno is null;
e.empno d.deptno d.deptno
NULL 40 40 -
A与B的并集不包含A与B的交集
1
2
3
4
5
6
7
8
9hive (test)> select
> e.empno, e.ename,e.deptno,d.deptno, d.deptno
> from
> emp e full join dept d
> on e.deptno = d.deptno
> where e.deptno is null or d.deptno is null;
e.empno e.ename e.deptno d.deptno d.deptno
8888 HIVE NULL NULL NULL
NULL NULL NULL 40 40-
nvl函数
nvl(e.detpno,d.depto)如果第一个值为null,返回第二的值
-