自定义解析json格式的数据
#创建表 create table json_tb(line string); #导入数据 load data local inpath '/root/test_data/a.json' into table json_tb; #显示原数据 select line from my_tb limit 10; +----------------------------------------------------------------+--+ | json_tb.line | +----------------------------------------------------------------+--+ | {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"} | | {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"} | | {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"} | | {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"} | | {"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"} | | {"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"} | | {"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"} | | {"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"} | | {"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"} | | {"movie":"919","rate":"4","timeStamp":"978301368","uid":"1"} | +----------------------------------------------------------------+--+ #自定义函数 package com.xiaojie.mm; import org.apache.hadoop.hive.ql.exec.UDF; import parquet.org.codehaus.jackson.map.ObjectMapper; public class JsonParse extends UDF{ public String evaluate(String jsonLine) { ObjectMapper objectMapper = new ObjectMapper(); try { MovieBean bean = objectMapper.readValue(jsonLine, MovieBean.class); return bean.toString(); }catch(Exception e){ } return ""; } } package com.xiaojie.mm; public class MovieBean { // 电影id private String movie; // 电影评分 private String rate; // 评分时间 private String timeStamp; // 用户id private String uid; public String getMovie() { return movie; } public void setMovie(String movie) { this.movie = movie; } public String getRate() { return rate; } public void setRate(String rate) { this.rate = rate; } public String getTimeStamp() { return timeStamp; } public void setTimeStamp(String timeStamp) { this.timeStamp = timeStamp; } public String getUid() { return uid; } public void setUid(String uid) { this.uid = uid; } @Override public String toString() { return this.movie + "\t" +this.rate + "\t" + this.timeStamp + "\t" + this.uid; } } #打jar包上传到hive所在机器,创建函数 add JAR /root/test_data/json_parse.jar; create temporary function json_parse as 'com.xiaojie.mm.JsonParse'; #使用自定义的json解析函数 select json_parse(line) from json_tb limit 10; +---------------------+--+ | _c0 | +---------------------+--+ | 1193 5 978300760 1 | | 661 3 978302109 1 | | 914 3 978301968 1 | | 3408 4 978300275 1 | | 2355 5 978824291 1 | | 1197 3 978302268 1 | | 1287 5 978302039 1 | | 2804 5 978300719 1 | | 594 4 978302268 1 | | 919 4 978301368 1 | +---------------------+--+ #将json解析的数据保存到一张新创建的表里 create table json_parse_tb as select split(json_parse(line),'\t')[0] as movieid, split(json_parse(line),'\t')[1] as rate, split(json_parse(line),'\t')[2] as time, split(json_parse(line),'\t')[3] as userid from json_tb limit 100; #内置json函数 select get_json_object(line,'$.movie') as moiveid, get_json_object(line,'$.rate') as rate, get_json_object(line,'$.timeStamp') as time, get_json_object(line,'$.uid') as userid from json_tb limit 10; Transform(调用自定义脚本)Hive的 TRANSFORM 关键字提供了在SQL中调用自写脚本的功能,适合实现Hive中没有的功能又不想写UDF的情况。
自定义python脚本(vim time_parse.py)
#!/bin/python import sys import datetime for line in sys.stdin: line = line.strip() movieid, rate, unixtime,userid = line.split('\t') weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday() print '\t'.join([movieid, rate, str(weekday),userid])将py文件导入到hive的工作目录下
add file time_parse.py使用transform调用自定义的py代码
create TABLE json_parse_time_tb as SELECT #根据transform括号中的参数,将json_parse_tb表的对应数据取出 TRANSFORM (movieid, rate, time, userid) USING 'python time_parse.py' AS (movieid, rate, weekday,userid) FROM json_parse_tb;查看新表数据
select * from json_parse_time_tb; +-----------------------------+--------------------------+-----------------------------+----------------------------+--+ | json_parse_time_tb.movieid | json_parse_time_tb.rate | json_parse_time_tb.weekday | json_parse_time_tb.userid | +-----------------------------+--------------------------+-----------------------------+----------------------------+--+ | 1690 | 3 | 1 | 2 | | 589 | 4 | 1 | 2 | | 3471 | 5 | 1 | 2 | | 1834 | 4 | 1 | 2 | | 2490 | 3 | 1 | 2 | | 2278 | 3 | 1 | 2 | | 110 | 5 | 1 | 2 | | 3257 | 3 | 1 | 2 | | 3256 | 2 | 1 | 2 | | 3255 | 4 | 1 | 2 | +-----------------------------+--------------------------+-----------------------------+----------------------------+--+ 案例原数据(用户名,月份,点击量)
A,2015-01,5 A,2015-01,15 B,2015-01,5 A,2015-01,8 B,2015-01,25 A,2015-01,5 A,2015-02,4 A,2015-02,6 B,2015-02,10 B,2015-02,5求每个人每个月的点击量,以及点击量累计