Kita bahas langsung satu per satu, gan.
1. Penentu url(alamat web) tujuan.
Bagian ini akan mengambil 1 url dari database yang belum diproses.
1. Penentu url(alamat web) tujuan.
Bagian ini akan mengambil 1 url dari database yang belum diproses.
3. Pemarsing (pemroses) hasil downloadan.
Fungsi parseHTML akan menerima string HTML kemudian mengekstrak semua link yang ada di string tersebut. Link tadi akan diambil domainnya saja untuk kemudian disimpan ke dalam database.
function db_get_url()
{
$sql = "SELECT id, url FROM tbl_url WHERE status='0' ORDER BY id LIMIT 1";
$rs = mysql_query($sql);
$url = '';
if ($data = mysql_fetch_array($rs))
{
$url = $data['url'];
$sql = "UPDATE tbl_url SET status='1' WHERE id='". $data['id']. "' ";
$rs = mysql_query($sql);
}
if ($url == '') $url = 'http://planet.terasi.net';
return $url;
}
Berikut ini adalah source code lengkapnya.
function parseHTML($html)
{
if (preg_match_all("/<a href=\"(.*?)\"/i", $html, $match)) {
foreach ($match[1] as $row) {
$domain = getDomain($row);
if ($domain != '')
{
db_insert_url($domain);
}
}
}
}
Jangan lupa buat tabelnya juga di databasenya.
<?php
// Open the MySQL connection and select the crawler database. Every later
// query depends on both, so stop immediately if either step fails.
$db = mysql_connect('localhost', 'phpkita', 'phpkita');
if ($db === false) {
    exit('Could not connect to MySQL: ' . mysql_error());
}
if (!mysql_select_db('db_phpkita', $db)) {
    exit('Could not select database db_phpkita: ' . mysql_error($db));
}
// Crawl loop: runs forever, there is no break condition.
while (true)
{
    // Next unprocessed URL; db_get_url() falls back to a fixed seed URL
    // when no pending row is found.
    $url = db_get_url();
    $html = getURL($url);
    // A failed download yields an empty string or false: nothing to store
    // and nothing to parse, so move on to the next URL.
    if (!is_string($html) || $html === '') {
        continue;
    }
    db_update_html($url, $html);
    parseHTML($html);
}
// NOTE(review): not reachable while the loop above has no exit path.
mysql_close($db);
exit;
/*
* fungsi-fungsi
*/
/**
 * Pick the next URL to crawl.
 *
 * Reads the row of tbl_url with the lowest id whose status is '0', sets that
 * row's status to '1' and returns its URL. When no such row exists, or the
 * lookup query fails, the fixed seed URL is returned instead.
 *
 * @return string URL to download next.
 */
function db_get_url()
{
    $url = '';
    $sql = "SELECT id, url FROM tbl_url WHERE status='0' ORDER BY id LIMIT 1";
    $rs = mysql_query($sql);
    // mysql_query() returns false on failure; fetching from false would only
    // raise a warning, so skip straight to the fallback in that case.
    if ($rs !== false) {
        $data = mysql_fetch_array($rs);
        mysql_free_result($rs);
        if ($data) {
            $url = $data['url'];
            // Mark the row as taken so the next call does not return it again.
            // The cast keeps the statement well-formed whatever the driver returns for id.
            $sql = "UPDATE tbl_url SET status='1' WHERE id='" . intval($data['id']) . "' ";
            mysql_query($sql);
        }
    }
    // Nothing pending (or lookup failed): start from the seed URL.
    if ($url == '') $url = 'http://planet.terasi.net';
    return $url;
}
/**
 * Download a URL with cURL and return the response body.
 *
 * Redirects are followed (at most 5) and response headers are not included
 * in the result. The connect timeout is always 60 seconds.
 *
 * @param string $url   Address to fetch; surrounding whitespace is trimmed.
 * @param int    $delay Total transfer timeout in seconds. Zero or a negative
 *                      value selects the default of 60 seconds.
 * @return string Response body, or "" when the URL is empty or the transfer
 *                fails (never boolean false).
 */
function getURL($url, $delay=0) {
    $url = trim($url);
    if ($url == "") {
        return "";
    }

    $delay = intval($delay);
    $timeout = ($delay > 0) ? $delay : 60;

    $ch = curl_init();
    if ($ch === false) {
        return "";
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 60);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

    $result = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; callers of this function expect a string.
    return ($result === false) ? "" : $result;
}
/**
 * Extract the link targets from an HTML string and queue their domains.
 *
 * Every href value of an <a ...> tag is reduced with getDomain(); each
 * distinct non-empty result is passed to db_insert_url() once per call.
 *
 * @param string $html HTML source to scan. Non-string or empty input is ignored.
 * @return void
 */
function parseHTML($html)
{
    if (!is_string($html) || $html === '') {
        return;
    }
    // Accepts href in any attribute position, with single or double quotes
    // and optional whitespace around '='. Group 2 holds the href value.
    $pattern = '/<a\s+(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/i';
    if (!preg_match_all($pattern, $html, $match)) {
        return;
    }
    // A page usually links to the same site many times; insert each domain once.
    $seen = array();
    foreach ($match[2] as $href) {
        $domain = getDomain($href);
        if ($domain != '' && !isset($seen[$domain])) {
            $seen[$domain] = true;
            db_insert_url($domain);
        }
    }
}
/**
 * Add a URL to tbl_url as a pending entry (empty html, status '0').
 *
 * tbl_url has a UNIQUE key on url, so a URL that is already stored is left
 * untouched instead of producing a duplicate-key error.
 *
 * @param string $url URL to queue.
 * @return bool True when the statement ran without error, false otherwise
 *              (including when $url is empty).
 */
function db_insert_url($url)
{
    if (!is_string($url) || $url === '') {
        return false;
    }
    $url = mysql_real_escape_string($url);
    // "url = url" is a no-op update: an existing row keeps its html and status.
    $sql = "INSERT INTO tbl_url (url, html, status) VALUES ('$url', '', '0') "
         . "ON DUPLICATE KEY UPDATE url = url";
    return mysql_query($sql) !== false;
}
/**
 * Store downloaded HTML on the tbl_url row(s) whose url equals $url.
 *
 * Both values are escaped with mysql_real_escape_string() before being
 * placed into the statement. The query result is not inspected.
 *
 * @param string $url  URL identifying the row to update.
 * @param string $html Page content to write into the html column.
 * @return void
 */
function db_update_html($url, $html)
{
    $safeUrl  = mysql_real_escape_string($url);
    $safeHtml = mysql_real_escape_string($html);
    mysql_query(
        "UPDATE tbl_url SET html='" . $safeHtml . "' WHERE url='" . $safeUrl . "' "
    );
}
/**
 * Reduce an absolute URL to its scheme and host part.
 *
 * Example: "http://my-site.example.com/page?x=1" gives
 * "http://my-site.example.com". Matching is case-insensitive and the
 * original letter case is kept in the result.
 *
 * @param string $url Link target as found in the HTML.
 * @return string "http://host" or "https://host", or '' when $url does not
 *                start with one of those schemes (relative links, mailto:, ...).
 */
function getDomain($url)
{
    // Host characters: letters, digits, underscore, dot and hyphen. The hyphen
    // sits last in the class so it is a literal, not a range operator.
    if (preg_match('/^(https?:\/\/[\w.-]+)/i', $url, $match)) {
        return $match[1];
    }
    return '';
}
?>
-- Crawl queue used by the PHP crawler script.
--   url    : value the crawler stores for each discovered link, unique per row
--   html   : downloaded page content, '' until the page has been fetched
--   status : 0 = not yet processed, 1 = already picked up by the crawler
CREATE TABLE IF NOT EXISTS `tbl_url` (
`id` int(8) NOT NULL AUTO_INCREMENT,
`url` varchar(128) NOT NULL,
-- MEDIUMTEXT instead of TEXT: TEXT holds at most 65,535 bytes, less than many HTML pages.
`html` mediumtext NOT NULL,
`status` int(1) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`),
UNIQUE KEY `url` (`url`)
) ENGINE=MyISAM;
-- Source: http://phpkita.wordpress.com
Tag :
PHP,
Programming
0 Komentar untuk "Membuat Web Crawler Sederhana"