Pig简单的代码实例:报表统计行业中的点击和曝光(2)


-- Step 1: attach the social profile (age) to every log record by matching
-- the profile's guid_social against the log's guid.
-- Expected fields per the original notes: (guid_social, age, xxx_ad_id, guid, log_type)
log_with_social = JOIN social_age BY guid_social,
                       distinct_origin_historical_age BY guid;

-- Step 2: drop the duplicate join key, keeping (xxx_ad_id, guid, log_type, age).
log_with_age = FOREACH log_with_social GENERATE
    xxx_ad_id,
    guid,
    log_type,
    age;

-- Step 3: attach campaign metadata by matching adId against xxx_ad_id.
-- Expected fields per the original notes:
-- (adId, industryId, brandId, xxx_ad_id, guid, log_type, age)
log_with_campaign = JOIN joined_ad_campaign_data BY adId,
                         log_with_age BY xxx_ad_id;

-- Step 4: keep only the columns the reports below consume:
-- (guid, log_type, industryId, brandId, age)
all_current_data = FOREACH log_with_campaign GENERATE
    guid,
    log_type,
    industryId,
    brandId,
    age;


-- ---------------------------------------------------------------------
-- Industry report: number of guids per (age, industryId, log_type),
-- computed over the current batch merged with the stored history.
-- ---------------------------------------------------------------------

-- Reorder the current batch into the layout of the industry history files:
-- (industryId, guid, age, log_type)
industry_current_data = FOREACH all_current_data GENERATE industryId,guid,age,log_type;  --(industryId,guid,age,log_type)


-- Load the history kept under $Industry_Path (comma-separated, same layout).
industry_existed_Data = LOAD '$Industry_Path' USING PigStorage(',') AS (industryId:chararray,guid:chararray,age:chararray,log_type:chararray);


-- Merge history with the current batch and drop duplicate rows so the same
-- (industryId, guid, age, log_type) combination is only counted once.
union_Industry = UNION industry_existed_Data, industry_current_data;
distict_union_industry = DISTINCT union_Industry;

-- Group key ($2,$0,$3) = (age, industryId, log_type).
-- In the grouped relation $1 is the bag of rows; $1.$1 is its guid column.
group_industry = GROUP distict_union_industry BY ($2,$0,$3);
count_guid_for_industry = FOREACH group_industry GENERATE FLATTEN(group),COUNT($1.$1);


-- rmf rather than rm: the summary is a derived output and may not exist yet
-- (e.g. on the first run); plain rm aborts when the path is missing.
rmf $Industry_SUM;
STORE count_guid_for_industry INTO '$Industry_SUM' USING PigStorage(',');


-- Persist the merged set as the new history: write it to a temporary path
-- first, then swap it in place of the old history directory.
-- NOTE(review): this relies on Pig running the pending STOREs before the
-- rm/mv below are executed -- confirm for the Pig version in use.
STORE distict_union_industry INTO '$Industry_TMP' USING PigStorage(',');
rm $Industry_Path;
mv $Industry_TMP $Industry_Path;


-- ---------------------------------------------------------------------
-- Industry + brand report: number of guids per
-- (age, industryId, brandId, log_type), computed over the current batch
-- merged with the stored history.
-- ---------------------------------------------------------------------

-- Reorder the current batch into the layout of the industry/brand history
-- files: (age, industryId, brandId, log_type, guid)
industry_brand_current = FOREACH all_current_data GENERATE age,industryId,brandId,log_type,guid;
--(age,industryId,brandId,log_type,guid)


-- Load the history kept under $Industry_Brand_Path (comma-separated, same layout).
industry_brand_history = LOAD '$Industry_Brand_Path' USING PigStorage(',') AS(age:chararray, industryId:chararray, brandId:chararray, log_type:chararray, guid:chararray);


-- Merge the current batch with history and drop duplicate rows.
union_industry_brand = UNION industry_brand_current,industry_brand_history;
unique_industry_brand = DISTINCT union_industry_brand;
--(age,industryId,brandId,log_type,guid)


-- Group key ($0,$1,$2,$3) = (age, industryId, brandId, log_type).
-- In the grouped relation $1 is the bag of rows; $1.$4 is its guid column.
group_industry_brand = GROUP unique_industry_brand BY ($0,$1,$2,$3);
count_guid_for_industry_brand = FOREACH group_industry_brand GENERATE FLATTEN(group),COUNT($1.$4);


-- rmf rather than rm: the summary is a derived output and may not exist yet
-- (e.g. on the first run); plain rm aborts when the path is missing.
rmf $Industry_Brand_SUM;
STORE count_guid_for_industry_brand INTO '$Industry_Brand_SUM' USING PigStorage(',');


-- Persist the merged set as the new history: write it to a temporary path
-- first, then swap it in place of the old history directory.
-- NOTE(review): this relies on Pig running the pending STOREs before the
-- rm/mv below are executed -- confirm for the Pig version in use.
STORE unique_industry_brand INTO '$Industry_Brand_TMP' USING PigStorage(',');
rm $Industry_Brand_Path;
mv $Industry_Brand_TMP $Industry_Brand_Path;


-- ---------------------------------------------------------------------
-- Age + log type report: number of guids per (age, log_type), computed
-- over the current batch merged with the stored history.
-- ---------------------------------------------------------------------

-- Reduce the current batch to the layout of the overall history files:
-- (age, log_type, guid)
current_data = FOREACH all_current_data GENERATE age,log_type,guid;--(age,log_type,guid)


-- Load the history kept under $ALL_Path (comma-separated, same layout).
history_data = LOAD '$ALL_Path' USING PigStorage(',') AS(age:chararray,log_type:chararray,guid:chararray);


-- Merge history with the current batch and drop duplicate rows.
union_all_data = UNION history_data, current_data;
unique_all_data = DISTINCT union_all_data;


-- Group key ($0,$1) = (age, log_type).
-- In the grouped relation $1 is the bag of rows; $1.$2 is its guid column.
group_all_data = GROUP unique_all_data BY ($0,$1);
count_guid_for_age_logtype = FOREACH group_all_data GENERATE FLATTEN(group),COUNT($1.$2);


-- rmf rather than rm: the summary is a derived output and may not exist yet
-- (e.g. on the first run); plain rm aborts when the path is missing.
rmf $ALL_SUM;
STORE count_guid_for_age_logtype INTO '$ALL_SUM' USING PigStorage(',');


-- Persist the merged set as the new history: write it to a temporary path
-- first, then swap it in place of the old history directory.
-- NOTE(review): this relies on Pig running the pending STOREs before the
-- rm/mv below are executed -- confirm for the Pig version in use.
STORE unique_all_data INTO '$ALL_TMP' USING PigStorage(',');
rm $ALL_Path;
mv $ALL_TMP $ALL_Path;

Pig 的安装与测试

Pig安装与配置教程

Pig 安装部署及MapReduce模式下测试

Pig安装及本地模式测试,体验

Pig的安装配置与基本使用

Hadoop Pig进阶语法

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/b8a038ef18218f7465c6baa6d19fd9d8.html