I want to write a script that dumps all of the data contained in these links into local files.
Has anyone done this in PHP? General guidelines and gotchas would be enough as an answer.

Meh. Don't parse HTML with regexes.
Here's a DOM version inspired by Tatu's:
<?php
function crawl_page($url, $depth = 5)
{
    static $seen = array();
    // bail out if we've already visited this URL or hit the depth limit
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }

    $seen[$url] = true;

    $dom = new DOMDocument('1.0');
    @$dom->loadHTMLFile($url);

    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        if (0 !== strpos($href, 'http')) {
            // resolve relative hrefs against the current URL
            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                $href = http_build_url($url, array('path' => $path));
            } else {
                $parts = parse_url($url);
                $href = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                }
                $href .= $parts['host'];
                if (isset($parts['port'])) {
                    $href .= ':' . $parts['port'];
                }
                $href .= dirname($parts['path'], 1) . $path;
            }
        }
        crawl_page($href, $depth - 1);
    }
    echo "URL:", $url, PHP_EOL, "CONTENT:", PHP_EOL, $dom->saveHTML(), PHP_EOL, PHP_EOL;
}
crawl_page("http://hobodave.com", 2);
Edit: I fixed some bugs in Tatu's version (it works with relative URLs now).
Edit: I added a new bit of functionality that prevents it from following the same URL twice.
Edit: it now echoes its output to STDOUT, so you can redirect it to whatever file you want.
Edit: Fixed a bug pointed out by George in his answer. Relative URLs will no longer be appended to the end of the URL path but will overwrite it. Thanks to George for this. Note that George's answer doesn't account for any of: https, user, pass, or port. If you have the http PECL extension loaded, this can be done with http_build_url; otherwise the URL has to be glued together manually with parse_url.
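To illustrate that fix, here is a minimal sketch of the resolution logic used in crawl_page() when the http extension is not available. The base URL and href below are made-up example values, not part of the original answer; the point is that a relative href replaces the last path segment rather than being appended to it.

<?php
// Made-up example values for illustration only.
$base = 'http://example.com/foo/bar.html';
$href = 'baz.html';

$parts = parse_url($base);
$path  = '/' . ltrim($href, '/');

// Resolve against the directory of the base path, as crawl_page() does.
$resolved = $parts['scheme'] . '://' . $parts['host'] . dirname($parts['path']) . $path;

echo $resolved, PHP_EOL; // http://example.com/foo/baz.html, not .../bar.htmlbaz.html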
Here is my implementation, based on the example/answer above.
It is class based
Uses Curl
Supports HTTP Auth
Skips URLs that do not belong to the base domain
Returns the HTTP header response code for every page
Returns the response time for every page
Crawl class:
class crawler
{
    protected $_url;
    protected $_depth;
    protected $_host;
    protected $_useHttpAuth = false;
    protected $_user;
    protected $_pass;
    protected $_seen = array();
    protected $_filter = array();

    public function __construct($url, $depth = 5)
    {
        $this->_url = $url;
        $this->_depth = $depth;
        $p = parse_url($url);
        $this->_host = $p['host'];
    }

    protected function _processAnchors($content, $url, $depth)
    {
        $dom = new DOMDocument('1.0');
        @$dom->loadHTML($content);
        $anchors = $dom->getElementsByTagName('a');

        foreach ($anchors as $element) {
            $href = $element->getAttribute('href');
            if (0 !== strpos($href, 'http')) {
                // resolve relative hrefs against the current URL
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            // Crawl only links that belong to the start domain
            $this->crawl_page($href, $depth - 1);
        }
    }

    protected function _getContent($url)
    {
        $handle = curl_init($url);
        if ($this->_useHttpAuth) {
            curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
            curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass);
        }
        // follows 302 redirects, creates problems with authentication
        // curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);
        // return the content
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);

        /* Get the HTML or whatever is linked in $url. */
        $response = curl_exec($handle);
        // response total time
        $time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);
        /* Check for 404 (file not found). */
        $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);

        curl_close($handle);
        return array($response, $httpCode, $time);
    }

    protected function _printResult($url, $depth, $httpcode, $time)
    {
        ob_end_flush();
        $currentDepth = $this->_depth - $depth;
        $count = count($this->_seen);
        echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>";
        ob_start();
        flush();
    }

    protected function isValid($url, $depth)
    {
        if (strpos($url, $this->_host) === false
            || $depth === 0
            || isset($this->_seen[$url])
        ) {
            return false;
        }
        foreach ($this->_filter as $excludePath) {
            if (strpos($url, $excludePath) !== false) {
                return false;
            }
        }
        return true;
    }

    public function crawl_page($url, $depth)
    {
        if (!$this->isValid($url, $depth)) {
            return;
        }
        // add to the seen URLs
        $this->_seen[$url] = true;
        // get content and response code
        list($content, $httpcode, $time) = $this->_getContent($url);
        // print the result for the current page
        $this->_printResult($url, $depth, $httpcode, $time);
        // process sub-pages
        $this->_processAnchors($content, $url, $depth);
    }

    public function setHttpAuth($user, $pass)
    {
        $this->_useHttpAuth = true;
        $this->_user = $user;
        $this->_pass = $pass;
    }

    public function addFilterPath($path)
    {
        $this->_filter[] = $path;
    }

    public function run()
    {
        $this->crawl_page($this->_url, $this->_depth);
    }
}
Usage:
// USAGE
$startURL = 'http://YOUR_URL/';
$depth = 6;
$username = 'YOURUSER';
$password = 'YOURPASS';
$crawler = new crawler($startURL, $depth);
$crawler->setHttpAuth($username, $password);
// Exclude paths with the following structure from being processed
$crawler->addFilterPath('customer/account/login/referer');
$crawler->run();
And in its simplest form:
function crawl_page($url, $depth = 5) {
    if ($depth > 0) {
        $html = file_get_contents($url);

        preg_match_all('~<a.*?href="(.*?)".*?>~', $html, $matches);

        foreach ($matches[1] as $newurl) {
            crawl_page($newurl, $depth - 1);
        }

        // save the current page's URL and content
        file_put_contents('results.txt', $url."\n\n".$html."\n\n", FILE_APPEND);
    }
}
crawl_page('http://www.domain.com/index.php', 5);
This function will get the contents of a page, then crawl all found links and save the contents to "results.txt". The function accepts a second parameter, depth, which defines how far the links should be followed. Pass 1 there if you only want to parse the links of the given page.
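For example, a minimal usage sketch (the URL is just a placeholder): with a depth of 1 only the start page itself is fetched and saved, and the links it contains are matched but not followed.

<?php
// Hypothetical invocation of the function above: depth 1 means the
// recursive calls run with depth 0 and therefore do nothing.
crawl_page('http://www.domain.com/index.php', 1);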