一千萬個為什麽

搜索

簡化代碼以加速php scraper

這段代碼會逐一讀取指定的頁面,從指定的表格中抓取所有內容,將其插入我的數據庫,並把結果回顯出來。

它運行得非常慢,我想知道如何簡化它,讓它跑得更快。

<?php

/**
 * Scrapes the "Details" table of info.php for page numbers 1001 through 5000,
 * echoes every record as an HTML table row and inserts it into the `list`
 * table.
 *
 * Whitespace is normalised per extracted value instead of being stripped from
 * the whole page, so spaces inside values (e.g. street addresses) survive and
 * the row delimiters can be matched regardless of how the page is indented.
 *
 * The pages are still fetched one after another with a blocking
 * file_get_contents() call per page; the parsing below is cheap next to that.
 */

/**
 * Finds the first match of a regular expression at or after a byte offset.
 *
 * @param string $subject Text to search.
 * @param string $pattern PCRE pattern including delimiters.
 * @param int    $offset  Byte offset at which the search starts.
 *
 * @return int|false Byte offset of the match, or false when there is none.
 */
function scraper_find($subject, $pattern, $offset = 0)
{
    if (preg_match($pattern, $subject, $match, PREG_OFFSET_CAPTURE, $offset) === 1) {
        return $match[0][1];
    }

    return false;
}

/**
 * Collapses every run of whitespace (and NUL bytes) into a single space and
 * trims the result.
 *
 * @param string $text Raw text.
 *
 * @return string Normalised text.
 */
function scraper_clean($text)
{
    return trim(preg_replace('/[\s\x00\x0B]+/', ' ', $text));
}

/**
 * Returns the tag-free text of $html from $start up to the first match of
 * $endPattern.
 *
 * @param string    $html       Markup to cut from.
 * @param int|false $start      Byte offset of the slice start (false = start marker not found).
 * @param string    $endPattern PCRE pattern marking the end of the slice.
 *
 * @return string Cleaned text, or '' when either boundary is missing.
 */
function scraper_text($html, $start, $endPattern)
{
    if ($start === false) {
        return '';
    }

    $end = scraper_find($html, $endPattern, $start);
    if ($end === false) {
        // Without an end marker a slice would swallow unrelated rows.
        return '';
    }

    return scraper_clean(strip_tags(substr($html, $start, $end - $start)));
}

/**
 * Extracts the value of a labelled table row, e.g. "Phone:".
 *
 * @param string   $table         Markup of the details table.
 * @param string   $label         Label text that opens the row.
 * @param string   $rowEndPattern PCRE pattern matching the end of a row.
 * @param string[] $extraLabels   Further label texts to remove from the value.
 *
 * @return string The value without its label(s), or '' when the row is absent.
 */
function scraper_field($table, $label, $rowEndPattern, $extraLabels = array())
{
    $text = scraper_text($table, strpos($table, $label), $rowEndPattern);
    $labels = array_merge(array($label), $extraLabels);

    return scraper_clean(str_replace($labels, '', $text));
}

// Include the database info once, not once per scraped page.
// NOTE(review): presumably inf.php opens the MySQL connection — confirm it
// does not depend on variables set inside the loop.
include("inf.php");
$tablename = 'list';

// End of a table row; tolerant of any indentation between the closing tags.
$rowEnd = '~</td>\s*</tr>~i';
// End of a single table cell.
$cellEnd = '~</td>~i';
// NOTE(review): the separator between first and last name (and between
// "City:" and "State/Province:") appears only as a raw line break in the
// original listing; a <br> tag is assumed here — confirm against a real page.
$lineBreak = '~<br\s*/?>~i';

// Set up the loop over the page numbers.
for ($pagenumber = 1001; $pagenumber <= 5000; $pagenumber++) {
    // Get the content.
    $url = "http://www.example.com/info.php?num=$pagenumber";
    $raw = file_get_contents($url);
    if ($raw === false) {
        // Download failed: skip this page rather than store garbage.
        continue;
    }

    $start = strpos($raw, '>Details<');
    if ($start === false) {
        // The page has no details table.
        continue;
    }
    $end = strpos($raw, '</table>', $start);
    $table1 = $end === false
        ? substr($raw, $start)
        : substr($raw, $start, $end - $start);

    // Get the first names: from the first cell up to the line break.
    $fnames = scraper_text($table1, strpos($table1, '<td'), $lineBreak);

    // Get the last names: from the line break up to the end of that cell.
    $lnames = scraper_text($table1, scraper_find($table1, $lineBreak), $cellEnd);

    // Get the labelled rows.
    $phone = scraper_field($table1, 'Phone:', $rowEnd);
    $ad = scraper_field($table1, 'Address:', $rowEnd);
    $apt = scraper_field($table1, 'Apt:', $rowEnd);
    $country = scraper_field($table1, 'Country:', $rowEnd);
    // The city row carries two labels in one cell; strip both.
    $city = scraper_field($table1, 'City:', $rowEnd, array('State/Province:'));
    $zip = scraper_field($table1, 'Zip:', $rowEnd);
    $email = scraper_field($table1, 'email:', $rowEnd);

    // Echo the row. The values come from a remote page, so escape them;
    // double_encode is off because strip_tags() leaves existing entities intact.
    // NOTE(review): assumes the scraped pages are UTF-8 — confirm.
    $cells = array($fnames, $lnames, $phone, $ad, $apt, $country, $city, $zip, $email);
    echo "<tr>\n<td>link</td>\n";
    foreach ($cells as $cell) {
        echo '<td>' . htmlspecialchars($cell, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) . "</td>\n";
    }
    echo "</tr>";

    // Escape the values for the SQL statement.
    $fnames = mysql_real_escape_string($fnames);
    $lnames = mysql_real_escape_string($lnames);
    $phone = mysql_real_escape_string($phone);
    $ad = mysql_real_escape_string($ad);
    $apt = mysql_real_escape_string($apt);
    $country = mysql_real_escape_string($country);
    $city = mysql_real_escape_string($city);
    $zip = mysql_real_escape_string($zip);
    $email = mysql_real_escape_string($email);

    // Insert the row into the db.
    $query = "INSERT INTO $tablename VALUES('', '$pagenumber', '$fnames', '$lnames', '$phone', '$ad', "
        . "'$apt','$country','$city','$zip', '$email')";
    mysql_query($query) or die(mysql_error());
}

?>

最佳答案

Don't use regex (or hand-rolled string searching) to parse HTML. You should use XPath; for PHP specifically, use DOMXPath.

轉載註明原文: 簡化代碼以加速php scraper