写了个采集论坛的小程序。里面包括了模拟登录、获取页面源代码、正则匹配等功能部分,希望对大家有用。
这个是09年的东西了,例子中的论坛已经不存在,是不是discuz我也不知道。不过还是可以学到一些东西。
<?php
// Author: 吴燕军 (Wu Yanjun)
// Date:   2009-06-27
// Forum scraper: logs in, fetches the first page of one board, then prints
// the author, title and opening post of every thread found on that page.

set_time_limit(0);

// File in which cURL stores, and from which it re-reads, the session cookies.
// NOTE(review): fixed, predictable path in a shared temp directory.
$cookie_jar = '/tmp/cookie.tmp';

/**
 * Functions ------------------------------------------------------------------
 */

/**
 * Perform an HTTP request with cURL and return the response body.
 *
 * A POST is sent only when $postfields is non-empty; otherwise the page is
 * fetched with a plain GET. The port is taken from the URL rather than being
 * pinned to 80, so https and non-default ports work as well.
 *
 * @param string       $url        Absolute URL to request.
 * @param string|array $postfields Request body; '' (or an empty array) means "no body".
 * @param string       $cookie_jar Path of the cookie file used for reading and writing.
 * @param string       $referer    Value of the Referer header ('' for none).
 *
 * @return string|false Response body, or false when the transfer failed.
 */
function request($url, $postfields, $cookie_jar, $referer)
{
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    $options = array(
        CURLOPT_URL            => $url,
        CURLOPT_HEADER         => 0,
        CURLOPT_NOBODY         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_COOKIEJAR      => $cookie_jar,
        CURLOPT_COOKIEFILE     => $cookie_jar,
        CURLOPT_REFERER        => $referer,
    );

    $hasBody = is_array($postfields)
        ? count($postfields) > 0
        : (string) $postfields !== '';
    if ($hasBody) {
        // Set after CURLOPT_NOBODY so the POST method is what finally applies.
        $options[CURLOPT_POST]       = 1;
        $options[CURLOPT_POSTFIELDS] = $postfields;
    }

    curl_setopt_array($ch, $options);
    $code = curl_exec($ch);
    curl_close($ch);

    return $code;
}

/**
 * Collect the thread ids linked from a board listing page.
 *
 * NOTE(review): "[.|\r|\n]" is a character class (literal dot, pipe, CR, LF),
 * not "any character"; it is kept as written because the intended markup
 * cannot be confirmed from this file.
 *
 * @param string $code HTML of the listing page.
 *
 * @return array List of thread ids as numeric strings (empty when none match).
 */
function getThreadsList($code)
{
    $found = preg_match_all(
        '~<!--[.|\r|\n]*?\s*<a href=\"viewthread\.php\?tid=(\d+)~',
        $code,
        $threads
    );

    if (!$found || !isset($threads[1])) {
        return array();
    }

    return $threads[1];
}

/**
 * Tell whether a fetched thread page is a real thread.
 *
 * @param string $code HTML of the thread page.
 *
 * @return bool false when the page carries the forum's "thread does not
 *              exist / was deleted / is under review" notice, true otherwise.
 */
function isExits($code)
{
    $notice = '~<p>\s*指定的主题不存在或已被删除或正在被审核,请返回。\s*</p>~';

    return preg_match($notice, $code) !== 1;
}

/**
 * Extract the thread title.
 *
 * @param string $code HTML of the thread page.
 *
 * @return string The first <h1>...</h1> element including its tags (callers
 *                strip the tags before printing), or '' when there is none.
 */
function getTitle($code)
{
    // Lazy match up to the closing tag; the "s" modifier lets titles span lines.
    if (!preg_match('~<h1>.*?</h1>~s', $code, $title_tmp)) {
        return '';
    }

    return $title_tmp[0];
}

/**
 * Extract the name of the thread author.
 *
 * @param string $code HTML of the thread page.
 *
 * @return string Text of the first user-profile link (tags removed), or ''
 *                when no such link is found.
 */
function getAuthor($code)
{
    $pattern = '~<a href=\"space\.php\?uid=\d+\" target=\"_blank\" id=\"userinfo\d+\"'
        . ' onmouseover=\"showMenu\(this\.id\)\">.+~';

    if (!preg_match($pattern, $code, $author_tmp)) {
        return '';
    }

    return strip_tags($author_tmp[0]);
}

/**
 * Extract the HTML of the opening post and make its image paths absolute.
 *
 * The match ends at the first closing </div>, so a post that itself contains
 * nested <div> elements is cut short at the first inner close tag.
 *
 * @param string $code    HTML of the thread page.
 * @param string $baseUrl Site root (with trailing slash) prefixed to every
 *                        "images/" occurrence.
 *
 * @return string The post's <div> element, or '' when it is not found.
 */
function getContents($code, $baseUrl = 'http://bbs.war3.cn/')
{
    $pattern = '~<div id=\"postmessage_\d+\" class=\"t_msgfont\">.*?</div>~s';

    if (!preg_match($pattern, $code, $contents_tmp)) {
        return '';
    }

    return str_replace('images/', $baseUrl . 'images/', $contents_tmp[0]);
}

/**
 * Print the thread title (tags removed).
 *
 * @param string $title Title fragment as returned by getTitle().
 */
function printTitle($title)
{
    echo " <strong> <h2>帖子标题: </h2> </strong>", strip_tags($title), " <br/> <br/>";
}

/**
 * Print the thread author (tags removed).
 *
 * @param string $author Author name as returned by getAuthor().
 */
function printAuthor($author)
{
    echo " <strong> <h2>帖子作者: </h2> </strong>", strip_tags($author), " <br/> <br/>";
}

/**
 * Print the opening post.
 *
 * NOTE(review): $contents is remote HTML echoed without escaping; that is the
 * purpose of the script, but it means the output trusts the scraped site.
 *
 * @param string $contents Post HTML as returned by getContents().
 */
function printContents($contents)
{
    echo " <strong> <h2>作者颁发的内容: </h2>", $contents, " </strong> <br/>";
}

/**
 * Print the "thread does not exist" message.
 */
function printError()
{
    echo " <i>该帖子不存在! </i>";
}

/**
 * End of functions -----------------------------------------------------------
 */

/**
 * Log in to the forum - begin
 */
$url = 'http://bbs.war3.cn/logging.php?action=login';
// Built with http_build_query so the URL and the non-ASCII submit label are
// percent-encoded; '&' is passed explicitly to be independent of php.ini.
$postfields = http_build_query(
    array(
        'loginfield'  => 'username',
        'username'    => '1nject10n',
        'password'    => 'xxxxxx',
        'questionid'  => 0,
        'cookietime'  => 315360000,
        'referer'     => 'http://bbs.war3.cn/',
        'loginsubmit' => '提交',
    ),
    '',
    '&'
);
if (request($url, $postfields, $cookie_jar, '') === false) {
    trigger_error('Login request failed; continuing without a session.', E_USER_WARNING);
}
unset($postfields, $url);
/**
 * Log in to the forum - end
 */

/**
 * Fetch the thread list (threads on the first page) - begin
 */
$url = 'http://bbs.war3.cn/forumdisplay.php?fid=57';
$code = request($url, '', $cookie_jar, '');
$threadsList = ($code === false) ? array() : getThreadsList($code);
/**
 * Fetch the thread list - end
 */

// Running count of threads printed so far.
$rows = 0;

/**
 * Fetch and print every listed thread - begin
 */
foreach ($threadsList as $list) {
    $url = "http://bbs.war3.cn/viewthread.php?tid=$list";
    // Fetch first, then test: the existence check must see THIS thread's page.
    $code = request($url, '', $cookie_jar, '');

    if ($code !== false && isExits($code)) {
        echo " <div>";
        echo " <h1>第", ($rows + 1), "贴: </h1> <br/>";
        printAuthor(getAuthor($code));
        printTitle(getTitle($code));
        printContents(getContents($code));
        echo " </div>";
        $rows++;
    } else {
        printError();
    }

    echo "----------------------------------------------------------------------------------------- <br/> <br/>";
}
/**
 * Fetch and print every listed thread - end
 */
?>