ETL项目1:大数据采集,清洗,处理:使用MapReduce进行离线数据分析完整项目

ETL项目1:大数据采集,清洗,处理:使用MapReduce进行离线数据分析完整项目 思路分析:

ETL项目1:大数据采集,清洗,处理:使用MapReduce进行离线数据分析完整项目

ETL项目1:大数据采集,清洗,处理:使用MapReduce进行离线数据分析完整项目

 

1.1 日志生成

ETL项目1:大数据采集,清洗,处理:使用MapReduce进行离线数据分析完整项目

用 curl 模拟请求,发往 nginx 反向代理的 80 端口,以此生成日志。

#!/bin/bash
#
# Generates web-server log traffic by sending simulated HTTP requests with
# curl to port 80. Every request carries a randomly chosen User-Agent header
# and a `uid` query parameter (the md5 of a counter); some requests also carry
# a random `country` parameter.
#
# Environment:
#   TARGET_HOST  Host to send requests to. The URL in the original script had
#                no host in front of ":80"; "localhost" is an assumed default,
#                so set this to the host running the proxy.
#
# Strict mode (`set -e`) is deliberately not enabled: a failed curl request
# must not stop the generator.

TARGET_HOST="${TARGET_HOST:-localhost}"

# Pool of User-Agent headers. Duplicated entries are intentional: they weight
# the random choice towards those agents.
readonly -a USER_AGENTS=(
  'User-Agent:MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
  'User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
  'User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2'
  'User-Agent:Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; QQBrowser/7.0.3698.400)'
  'User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'
  'User-Agent:Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
  'User-Agent:Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
  'User-Agent:Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
  'User-Agent:Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
  'User-Agent:Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
  'User-Agent:Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
  'User-Agent:Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
  'User-Agent:MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
  'User-Agent:MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
  'User-Agent:MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
)

# Pool of country codes; 'CN' is repeated to weight the random choice.
readonly -a COUNTRY_CODES=(
  'CN' 'CN' 'CN' 'CN' 'CN' 'US' 'GE' 'GB' 'FR' 'KR' 'AR' 'RU' 'SE' 'SG'
)

#######################################
# Print one randomly chosen User-Agent header line.
# Globals:  USER_AGENTS (read)
# Outputs:  the header ("User-Agent:...") on stdout
#######################################
get_user_agent() {
  # Take the modulus from the real array length. The original hard-coded 14
  # for a 15-element array, so the last entry could never be selected.
  printf '%s\n' "${USER_AGENTS[$((RANDOM % ${#USER_AGENTS[@]}))]}"
}

#######################################
# Print the current hour without a leading zero (e.g. "09" becomes "9").
# Outputs:  the hour (0-23) on stdout
#######################################
get_hour() {
  local hour
  hour=$(date +%H)
  # Force base 10 so that "08" and "09" are not rejected as invalid octal.
  printf '%s\n' "$((10#${hour}))"
}

#######################################
# Print one randomly chosen country code.
# Globals:  COUNTRY_CODES (read)
# Outputs:  the country code on stdout
#######################################
_random_country() {
  printf '%s\n' "${COUNTRY_CODES[$((RANDOM % ${#COUNTRY_CODES[@]}))]}"
}

#######################################
# Print the uid for a counter value: the md5 hex digest of the number
# followed by a newline (identical to `echo N | md5sum`).
# Arguments: $1 - counter value
# Outputs:   32-character hex digest on stdout
#######################################
_uid_for() {
  local digest
  digest=$(printf '%s\n' "$1" | md5sum)
  # md5sum prints "<digest>  -"; keep only the digest.
  printf '%s\n' "${digest%% *}"
}

#######################################
# Send one request per uid over a counter range, pausing after each request.
# Globals:   TARGET_HOST (read)
# Arguments: $1 - first counter value
#            $2 - last counter value (inclusive)
#            $3 - number of passes over the range
#            $4 - seconds to sleep after each request
#            $5 - add a country parameter when counter % $5 == 0
#                 (0 = never, 1 = every request)
#            $6 - 1 to also print "country:<code>" for requests that have one
# Outputs:   "user_agent:<header>" per request on stdout
#######################################
_send_uid_range() {
  local first=$1 last=$2 passes=$3 pause=$4 country_every=$5
  local show_country=${6:-0}
  local pass n uid user_agent country url

  for ((pass = 0; pass < passes; pass++)); do
    for ((n = first; n <= last; n++)); do
      uid=$(_uid_for "$n")
      url="${TARGET_HOST}:80?uid=${uid}"

      if ((country_every > 0 && n % country_every == 0)); then
        country=$(_random_country)
        if ((show_country)); then
          echo "country:$country"
        fi
        url+="&country=${country}"
      fi

      user_agent=$(get_user_agent)
      echo "user_agent:$user_agent"

      # Best effort: a failed request is ignored so the run continues.
      /usr/bin/curl -s -o /dev/null -H "${user_agent}" "${url}"
      sleep "$pause"
    done
  done
}

#######################################
# Counters 1..1000, one pass, never a country parameter, 2 s between requests.
# (The name mentions other numbers; this describes what the code does.)
#######################################
send_1_10000_for1_sleep10_notwithcountry() {
  _send_uid_range 1 1000 1 2 0
}

#######################################
# Counters 500..1500, one pass, 2 s between requests; a country parameter is
# added whenever the counter is a multiple of 50.
# (The name mentions other numbers; this describes what the code does.)
#######################################
send_5000_15000_for1_sleep6_withcountry500() {
  _send_uid_range 500 1500 1 2 50
}

#######################################
# Counters 1..2000, country parameter on every request, 1 s between requests.
# NOTE(review): the name says "for1", but the original pass counter started
# at 0 and the loop stopped at 2, so the range is sent twice. Kept as-is;
# confirm whether a single pass was intended.
#######################################
send_1_20000_for1_sleep3_withcountry() {
  _send_uid_range 1 2000 2 1 1 1
}

# Run the three generators one after another. They are called directly: the
# original wrapped each call in backticks, which captured the functions'
# output and then tried to execute that output as a command. The functions
# now return when finished instead of calling `exit`, so a direct call does
# not end the script after the first generator.
send_1_10000_for1_sleep10_notwithcountry
send_5000_15000_for1_sleep6_withcountry500
send_1_20000_for1_sleep3_withcountry

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/wpwssj.html