create table test( name string, friends array, children map, address struct ) row format delimited fields terminated by ',' collection items terminated by '_' map keys terminated by ':' lines terminated by '\n';
row format delimited fields terminated by ‘,’ – 列分隔符
collection items terminated by ‘_’ --MAP STRUCT 和 ARRAY 的分隔符(数据分割符号)
map keys terminated by ‘:’ – MAP中的key与value的分隔符
lines terminated by ‘\n’; – 行分隔符
1 2
hive (default)> select friends[1],children['xiao song'],address.city from test where name="songsong";
如果文件数据是纯文本,可以使用STORED AS TEXTFILE。如果数据需要压缩,使用 STORED AS SEQUENCEFILE。
(9)LOCATION :指定表在HDFS上的存储位置。
(10)AS:后跟查询语句,根据查询结果创建表。
(11)LIKE允许用户复制现有的表结构,但是不复制数据
创建表
1 2 3 4 5 6 7 8
create table if not exists student2( id int, name string ) row format delimited fields terminated by '\t' stored as textfile location '/user/hive/warehouse/student2'; -- 直接创建
create table if not exists student3 as select id, name from student; -- 使用as子句
create table dept_partition (deptno int, dname string, loc string) partitioned by (month string) row format delimited fields terminated by '\t';
查询分区表的分区
1
show partitions dept_partition;
向表中的插入数据
1 2 3 4
load data local inpath '/opt/module/data/01.txt' into table dept_partition partition (month = '01'); load data local inpath '/opt/module/data/02.txt' into table dept_partition partition (month = '02');
-- 添加分区 alter table dept_partition add partition(month = '03');
-- 直接修复 msck repair table dept_partition;
-- 上传数据直接带分区 load data local inpath '/opt/module/data/03.txt' into table dept_partition partition (month = '03');
建立二级分区表
1 2 3 4 5 6
create table dept_partition2 (deptno int, dname string, loc string) partitioned by (month string, day string) row format delimited fields terminated by '\t';
x表示从哪个bucket开始抽取,如果需要取多个分区,以后的分区号为当前分区号加上y。例如,table总bucket数为4,tablesample(bucket 1 out of 2),表示总共抽取(4/2=)2个bucket的数据,抽取第1(x)个和第3(x+y)个bucket的数据。
注意:x的值必须小于等于y的值,否则会提示如下错误:
FAILED: SemanticException [Error 10061]: Numerator should not be bigger than denominator in sample clause for table stu_buckFAILED: SemanticException [Error 10061]: Numerator should not be bigger than denominator in sample clause for table stu_buck
函数操作
常用查询函数
nvl()
1
select comm, nvl(comm, -1) from emp; -- nvl会讲comm列为空的值替换为-1
case when
1 2 3 4 5 6 7 8
select dept_id, sum(case sex when '男' then 1 else 0 end) male_count, sum(case sex when '女' then 1 else 0 end) female_count from emp_sex group by dept_id;
concat(),字符串连接函数
concat_ws(),将字符串用分隔符连接起来
collect_set(),将分组后的某一个字段进行汇总,变成array
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+-------------------+----------------------------+-------------------------+ | person_info.name | person_info.constellation | person_info.blood_type | +-------------------+----------------------------+-------------------------+ | 孙悟空 | 白羊座 | A | | 大海 | 射手座 | A | | 宋宋 | 白羊座 | B | | 猪八戒 | 白羊座 | A | | 凤姐 | 射手座 | A | | 苍老师 | 白羊座 | B | +-------------------+----------------------------+-------------------------+
select base.type, concat_ws('|', collect_set(base.name)) as name from (select name, concat_ws(',', constellation, blood_type) as type from person_info) base group by base.type;
explode(),将hive一列中复杂的array或者map拆分成多行
lateral view udtf(expression) tableAlias as columnAlias
+----------------+---------------------+----------------+ | business.name | business.orderdate | business.cost | +----------------+---------------------+----------------+ | jack | 2017-01-01 | 10 | | tony | 2017-01-02 | 15 | | jack | 2017-02-03 | 23 | | tony | 2017-01-04 | 29 | | jack | 2017-01-05 | 46 | | jack | 2017-04-06 | 42 | | tony | 2017-01-07 | 50 | | jack | 2017-01-08 | 55 | | mart | 2017-04-08 | 62 | | mart | 2017-04-09 | 68 | | neil | 2017-05-10 | 12 | | mart | 2017-04-11 | 75 | | neil | 2017-06-12 | 80 | | mart | 2017-04-13 | 94 | +----------------+---------------------+----------------+
查询在2017年4月份购买过的顾客及总人数
1 2 3 4 5
select name, count(*) over() as people_sum -- over()限定了count()聚合函数的范围,表示为当前select 出来的东西进行计数,所以开窗函数的作用就是给聚合函数提供了限制范围 from business where substring(orderdate, 1, 7) = '2017-04' group by name;
查询2017年4月顾客的购买明细及月购买总额
1 2 3 4
select name, orderdate, cost, sum(cost) over(partition by name) as sum_cost -- partition by name 表示了sum函数的聚合范围为不同的name进行聚合 from business where substring(orderdate, 1, 7) = '2017-04';
上述的场景, 将每个顾客的cost按照日期进行累加
1 2 3 4 5 6 7 8 9 10 11
select name, orderdate, cost, sum(cost) over(partition by name order by orderdate asc rows between unbounded preceding and current row) as sum_cost -- order by限定了以订单日期进行排序,rows between xx and yy 限定了哪些行进行累加 from business where substring(orderdate, 1, 7) = '2017-04'; -- 升序累加
select name, orderdate, cost, sum(cost) over(partition by name order by orderdate desc rows between current row and unbounded following) as sum_cost from business where substring(orderdate, 1, 7) = '2017-04'; -- 降序累加
求消费明细以及每个月有哪些顾客来过
1 2 3
select name,orderdate,cost, collect_set(name) over(partition by substring(orderdate, 1, 7)) people from business;
查看顾客上次的购买时间
1 2 3
select name, orderdate, cost, lag(orderdate, 1, '1970-01-01') over (partition by name order by orderdate) last_order -- lag后面必须跟一个排序窗口 from business;
查询前20%订单的信息
1 2 3 4 5 6 7 8 9 10 11
select * from ( select name,orderdate,cost, ntile(5) over(order by orderdate) sorted -- ntile函数会把窗口分成几份,然后把份数填进去 from business ) t where sorted = 1;
select * from (select name,orderdate,cost, percent_rank() over (order by orderdate) pr from business) a where a.pr <= 0.2;
关于窗口限定范围的详细范例
1 2 3 4 5 6 7 8 9
select name,orderdate,cost, sum(cost) over() as sample1,--所有行相加 sum(cost) over(partition by name) as sample2,--按name分组,组内数据相加 sum(cost) over(partition by name order by orderdate) as sample3,--按name分组,组内数据累加 sum(cost) over(partition by name order by orderdate rows between UNBOUNDED PRECEDING and current row ) as sample4 ,--和sample3一样,由起点到当前行的聚合 sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING and current row) as sample5, --当前行和前面一行做聚合 sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING AND 1 FOLLOWING ) as sample6,--当前行和前边一行及后面一行 sum(cost) over(partition by name order by orderdate rows between current row and UNBOUNDED FOLLOWING ) as sample7 --当前行及后面所有行 from business;
哪些顾客两天内来过我的店
1 2 3 4 5 6 7
select distinct name from ( select name,orderdate, lag(orderdate, 1) over(partition by name order by orderdate asc) last_date from business ) a where datediff(orderdate, last_date) = 2;
select *, rank() over(partition by subject order by score desc), -- rank会根据窗口里的排序给予对应的序号,排序相同时序号会相同,总数不会少 dense_rank() over(partition by subject order by score desc), -- dense_rank会根据窗口里的排序给予对应的序号,排序相同时序号会相同,但是不会累加,会导致排名减少 row_number() over (partition by subject order by score desc) -- 给每一行进行编号,按照顺序给予号码 from score;