Berbagi Artikel-Artikel Unik Dan Bermutu

Blog Archive

Copyright © 2015 Artikel TopNews | . Powered by Blogger.

Labels

ads3

tag

Top Artikel

ads2

ads

iklan

Membuat Web Crawler Sederhana

Kita bahas langsung satu per satu, gan.

1. Penentu URL (alamat web) tujuan.
Bagian ini akan mengambil 1 url dari database yang belum diproses.

/**
 * Fetch the next unprocessed URL from tbl_url and mark it as processed.
 *
 * Selects the row with the lowest id whose status is '0', sets its status
 * to '1' and returns its url. When the lookup fails or finds no such row,
 * the hard-coded seed URL is returned instead.
 *
 * Uses the default mysql link (no link identifier is passed).
 *
 * @return string URL to crawl next; never empty.
 */
function db_get_url()
{
    $url = '';

    $rs = mysql_query("SELECT id, url FROM tbl_url WHERE status='0' ORDER BY id LIMIT 1");

    // mysql_query() returns false on error; handing that to
    // mysql_fetch_array() would only raise a warning, so guard it.
    if ($rs !== false) {
        $data = mysql_fetch_array($rs);
        if ($data) {
            $url = $data['url'];

            // id comes from an integer column; the cast keeps the statement
            // well-formed whatever the driver hands back.
            mysql_query("UPDATE tbl_url SET status='1' WHERE id='" . (int) $data['id'] . "' ");
        }
    }

    // Nothing queued (or the lookup failed): start from the seed site.
    if ($url == '') {
        $url = 'http://planet.terasi.net';
    }

    return $url;
}
3. Pemarsing (pemroses) hasil downloadan.
Fungsi parseHTML akan menerima string HTML, kemudian mengekstrak semua link yang ada di string tersebut. Link tadi akan diambil domainnya saja untuk kemudian disimpan ke dalam database.
/**
 * Extract every link from an HTML string and queue its domain for crawling.
 *
 * Each href value found in an <a> tag is reduced to its domain with
 * getDomain(); every distinct non-empty domain is passed to
 * db_insert_url() once.
 *
 * @param string $html Raw HTML of a downloaded page.
 * @return void
 */
function parseHTML($html)
{
    // href may follow other attributes and be quoted with either ' or ".
    // Requiring whitespace right before "href" keeps data-href and similar
    // attributes from matching.
    $pattern = '/<a\s+(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/i';

    if (!preg_match_all($pattern, (string) $html, $match)) {
        return;
    }

    // A page usually links to the same site many times; collapse repeats
    // so each domain triggers a single insert.
    $domains = array();
    foreach ($match[2] as $href) {
        $domain = getDomain($href);
        if ($domain != '') {
            $domains[$domain] = true;
        }
    }

    foreach (array_keys($domains) as $domain) {
        db_insert_url($domain);
    }
}
Berikut ini adalah source code lengkapnya.
<?php

$db = mysql_connect('localhost', 'phpkita', 'phpkita');
if ($db === false) {
    // Without a connection every query below would fail on each pass of
    // the endless loop, so stop right away.
    die('Could not connect to MySQL: ' . mysql_error());
}
if (!mysql_select_db('db_phpkita', $db)) {
    die('Could not select database: ' . mysql_error($db));
}

// Crawl forever: take the next queued URL, download it, store the page
// and queue every domain it links to.
while (true) {
    $url = db_get_url();
    $html = getURL($url);

    // A failed download yields false or an empty string; there is nothing
    // to store or parse in that case.
    if ($html === false || $html === '') {
        continue;
    }

    db_update_html($url, $html);
    parseHTML($html);
}

// Not reached while the loop above has no exit condition; kept so the
// shutdown path is in place once one is added.
mysql_close($db);
exit;

/*
* fungsi-fungsi
*/
/**
 * Fetch the next unprocessed URL from tbl_url and mark it as processed.
 *
 * Selects the row with the lowest id whose status is '0', sets its status
 * to '1' and returns its url. When the lookup fails or finds no such row,
 * the hard-coded seed URL is returned instead.
 *
 * Uses the default mysql link (no link identifier is passed).
 *
 * @return string URL to crawl next; never empty.
 */
function db_get_url()
{
    $url = '';

    $rs = mysql_query("SELECT id, url FROM tbl_url WHERE status='0' ORDER BY id LIMIT 1");

    // mysql_query() returns false on error; handing that to
    // mysql_fetch_array() would only raise a warning, so guard it.
    if ($rs !== false) {
        $data = mysql_fetch_array($rs);
        if ($data) {
            $url = $data['url'];

            // id comes from an integer column; the cast keeps the statement
            // well-formed whatever the driver hands back.
            mysql_query("UPDATE tbl_url SET status='1' WHERE id='" . (int) $data['id'] . "' ");
        }
    }

    // Nothing queued (or the lookup failed): start from the seed site.
    if ($url == '') {
        $url = 'http://planet.terasi.net';
    }

    return $url;
}

/**
 * Download a URL with cURL and return the response body.
 *
 * Follows up to 5 redirects. The connect timeout is always 60 seconds; the
 * overall transfer timeout is $delay seconds when $delay is positive and
 * 60 seconds otherwise.
 *
 * @param string $url   Address to fetch; surrounding whitespace is ignored.
 * @param int    $delay Overall timeout in seconds; 0 selects the 60 s default.
 * @return string Response body, or "" when $url is blank or the transfer fails.
 */
function getURL($url, $delay = 0)
{
    $url = trim($url);
    if ($url == '') {
        return '';
    }

    $delay = intval($delay);

    $ch = curl_init();
    if ($ch === false) {
        return '';
    }

    curl_setopt_array($ch, array(
        CURLOPT_URL            => $url,
        CURLOPT_HEADER         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_MAXREDIRS      => 5,
        CURLOPT_CONNECTTIMEOUT => 60,
        // A non-positive $delay is not a usable timeout; use the default.
        CURLOPT_TIMEOUT        => ($delay > 0) ? $delay : 60,
    ));

    $result = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; normalise to "" so callers
    // always receive a string, matching the blank-URL case above.
    return ($result === false) ? '' : $result;
}

/**
 * Extract every link from an HTML string and queue its domain for crawling.
 *
 * Each href value found in an <a> tag is reduced to its domain with
 * getDomain(); every distinct non-empty domain is passed to
 * db_insert_url() once.
 *
 * @param string $html Raw HTML of a downloaded page.
 * @return void
 */
function parseHTML($html)
{
    // href may follow other attributes and be quoted with either ' or ".
    // Requiring whitespace right before "href" keeps data-href and similar
    // attributes from matching.
    $pattern = '/<a\s+(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/i';

    if (!preg_match_all($pattern, (string) $html, $match)) {
        return;
    }

    // A page usually links to the same site many times; collapse repeats
    // so each domain triggers a single insert.
    $domains = array();
    foreach ($match[2] as $href) {
        $domain = getDomain($href);
        if ($domain != '') {
            $domains[$domain] = true;
        }
    }

    foreach (array_keys($domains) as $domain) {
        db_insert_url($domain);
    }
}

/**
 * Add a URL to tbl_url so the crawler can pick it up later.
 *
 * The new row gets an empty html field and status '0'. The query result
 * is not inspected, so a failed insert goes unnoticed.
 *
 * @param string $url URL to queue; escaped here before use in the query.
 * @return void
 */
function db_insert_url($url)
{
    $url = mysql_real_escape_string($url);

    mysql_query("INSERT INTO tbl_url (url, html, status) VALUES ('$url', '', '0')");
}

/**
 * Store downloaded HTML on the tbl_url row(s) whose url matches.
 *
 * Both values are escaped here before being placed in the statement. The
 * query result is not inspected, so a failed update goes unnoticed.
 *
 * @param string $url  URL identifying the row to update.
 * @param string $html Page content to store.
 * @return void
 */
function db_update_html($url, $html)
{
    $url = mysql_real_escape_string($url);
    $html = mysql_real_escape_string($html);

    mysql_query("UPDATE tbl_url SET html='$html' WHERE url='$url' ");
}

/**
 * Reduce an absolute link to its scheme and host, e.g.
 * "http://my-site.example.com/page?x=1" becomes "http://my-site.example.com".
 *
 * Only links that start with http:// or https:// (case-insensitive) are
 * recognised; anything else, such as relative paths or mailto: links,
 * yields an empty string. The original letter case is kept.
 *
 * @param string $url Link as found in an href attribute.
 * @return string Scheme plus host, or "" when $url is not an absolute http(s) link.
 */
function getDomain($url)
{
    // The hyphen sits last in the class so it is a literal rather than a
    // range operator; \w already covers letters, digits and underscore.
    if (preg_match('/^(https?:\/\/[\w.-]+)/i', $url, $match)) {
        return $match[1];
    }

    return '';
}
?>
Jangan lupa buat tabelnya juga di database:
-- Crawl queue used by the crawler script: one row per discovered URL.
CREATE TABLE `tbl_url` (
  `id` int(8) NOT NULL AUTO_INCREMENT,
  `url` varchar(128) NOT NULL,
  -- MEDIUMTEXT rather than TEXT: TEXT holds at most 64 KB, which many
  -- downloaded pages exceed.
  `html` mediumtext NOT NULL,
  -- '0' = waiting to be crawled, '1' = already handed to the crawler.
  `status` int(1) NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`),
  -- Each URL may be queued only once.
  UNIQUE KEY `url` (`url`)
) ENGINE=MyISAM;
sumber : http://phpkita.wordpress.com
Tag : PHP, Programming
0 Komentar untuk "Membuat Web Crawler Sederhana"
Back To Top