<?php
/**
* @file recuperer_lien.php
* @brief Page permettant de récupérer les liens d'une page web.
*
* @author hughes monget
* @see http://monget.com/
*/
// Announce the page encoding, unless output (and therefore the headers) has already been sent.
if (!headers_sent())
{
    header('Content-Type: text/html; charset=UTF-8');
}
// Remote pages are downloaded further down in this script; remove the execution time limit.
set_time_limit(0);

// Magic quotes disappeared in PHP 5.4 and get_magic_quotes_gpc() itself was removed in
// PHP 8.0 (calling it there is a fatal error). The version test short-circuits, so the
// function is only ever called on the old versions where it can still return TRUE.
$bool_magic_quotes = version_compare(PHP_VERSION, '5.4.0', '<') && get_magic_quotes_gpc();

// "url": newline-separated list of page addresses to scan.
$str_liste_url = '';
if (isset($_REQUEST['url']) && is_string($_REQUEST['url']))
{
    $str_liste_url = trim($_REQUEST['url']);
    if ($bool_magic_quotes)
    {
        $str_liste_url = stripslashes($str_liste_url);
    }
    // Decode HTML entities in the submitted text (presumably so that URLs copied from
    // HTML source, e.g. with "&amp;", still work). Flags and charset are explicit because
    // the defaults changed across PHP versions and this page is UTF-8 throughout.
    $str_liste_url = html_entity_decode($str_liste_url, ENT_QUOTES, 'UTF-8');
}

// "type": which tag to extract links from; the first entry is the default.
$arr_str_type = array('a', 'img');
$str_type = reset($arr_str_type);
if (isset($_REQUEST['type']) && is_string($_REQUEST['type']) && in_array($_REQUEST['type'], $arr_str_type, TRUE))
{
    $str_type = $_REQUEST['type'];
}

// "recherche": optional substring the extracted links must contain.
$str_recherche = '';
if (isset($_REQUEST['recherche']) && is_string($_REQUEST['recherche']))
{
    $str_recherche = trim($_REQUEST['recherche']);
    if ($bool_magic_quotes)
    {
        $str_recherche = stripslashes($str_recherche);
    }
}
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="fr">
<head>
<title>Récupérer les liens de la page</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<style type="text/css">
<!--
input.url
{
width: 600px;
}
-->
</style>
</head>
<body>
<form method="post" action="">
<table>
<tr>
<td>
<label style="vertical-align:top" for="url">url</label> <textarea rows="5" cols="100" type="text" class="url" id="url" name="url"><?php echo htmlspecialchars($str_liste_url, ENT_QUOTES, 'UTF-8'); ?></textarea>
</td>
</tr>
<tr>
<td>
<input type="submit" value="submit" />
<?php
// One radio button per supported link type; the currently selected type is pre-checked.
foreach ($arr_str_type as $str_choix)
{
    $str_identifiant = 'type_' . $str_choix;
    $str_attribut = '';
    if ($str_choix == $str_type)
    {
        $str_attribut = ' checked="checked"';
    }
    printf(
        '<input type="radio" id="%s" name="type" value="%s" %s /> <label for="%s">%s</label>',
        $str_identifiant,
        $str_choix,
        $str_attribut,
        $str_identifiant,
        $str_choix
    );
}
// Free-text filter field, pre-filled with the HTML-escaped current search string.
printf(
    ' <input type="text" name="recherche" value="%s" />',
    htmlspecialchars($str_recherche, ENT_QUOTES, 'UTF-8')
);
?>
</td>
</tr>
</table>
</form>
<?php
/**
 * Turn a link extracted from a page into an absolute URL.
 *
 * - A value that already carries a scheme ("http:", "https:", "mailto:", ...) is returned unchanged.
 * - A protocol-relative value ("//host/path") inherits the scheme of the base.
 * - Anything else is appended to the base with exactly one "/" in between. Path-relative
 *   links are therefore resolved against the site root, not against the page's directory.
 *
 * @param mixed       $str_url  Raw href/src value.
 * @param string|null $str_base Host name, or "scheme://host[:port]", of the page the link was
 *                              found on. A bare host name is assumed to be http. When omitted,
 *                              the global $str_host is used (previous behaviour).
 * @return string|false Absolute URL, or FALSE when $str_url is not a non-empty string.
 */
function corriger_url($str_url, $str_base = NULL)
{
    if (!is_string($str_url) || !strlen($str_url)) { return FALSE; }
    if ($str_base === NULL)
    {
        $str_base = isset($GLOBALS['str_host']) ? (string) $GLOBALS['str_host'] : '';
    }
    if (strpos($str_base, '://') === FALSE)
    {
        $str_base = 'http://' . $str_base;
    }
    // Already absolute: only a scheme at the very start counts, not "http://" somewhere
    // inside a query string.
    if (preg_match('!^[a-z][a-z0-9+.-]*:!i', $str_url))
    {
        return $str_url;
    }
    // Protocol-relative: reuse the scheme of the base ("https:" . "//cdn.example/x").
    if (strncmp($str_url, '//', 2) === 0)
    {
        return substr($str_base, 0, strpos($str_base, '://') + 1) . $str_url;
    }
    return rtrim($str_base, '/') . '/' . ltrim($str_url, '/');
}

// One page address per line. Trim before de-duplicating so that "\r" left over from
// CRLF line endings does not make identical lines look different.
$arr_str_page = array_map('trim', explode("\n", $str_liste_url));
$arr_str_page = array_filter($arr_str_page, function ($str_ligne) { return $str_ligne !== ''; });
$arr_str_page = array_unique($arr_str_page);

// NOTE(review): the server downloads whatever http(s) address the visitor submits;
// restrict access to this page if it is reachable from untrusted networks.
$arr_str_resultat = array();
foreach ($arr_str_page as $str_url_page)
{
    if (!preg_match('!^https?://!i', $str_url_page))
    {
        continue;
    }
    $arr_str_url = parse_url($str_url_page);
    if (!is_array($arr_str_url) || empty($arr_str_url['host']))
    {
        continue;
    }
    // Still assigned so that corriger_url() called without a base keeps working.
    $str_host = $arr_str_url['host'];
    // Base used to resolve the links of THIS page (scheme, host and optional port).
    $str_origine = strtolower($arr_str_url['scheme']) . '://' . $str_host
        . (isset($arr_str_url['port']) ? ':' . $arr_str_url['port'] : '');

    $str_contenu = file_get_contents($str_url_page);
    if (!$str_contenu)
    {
        continue;
    }
    $str_regex = '//';
    switch ($str_type)
    {
        case 'a': $str_regex = '/<a[^>]+href\s*=\s*"([^>"\s]+)"/iU'; break;
        case 'img': $str_regex = '/<img[^>]+src\s*=\s*["\']([^>\n"]+)["\']/iU'; break;
        default: exit('type inconnu'); break;
    }
    if (!preg_match_all($str_regex, $str_contenu, $arr_str_match))
    {
        continue;
    }
    foreach ($arr_str_match[1] as $str_lien)
    {
        // Attribute values are HTML text: decode entities ("&amp;" -> "&") to get the real
        // URL, then resolve it against the page it came from.
        $str_lien = corriger_url(html_entity_decode($str_lien, ENT_QUOTES, 'UTF-8'), $str_origine);
        if ($str_lien !== FALSE)
        {
            $arr_str_resultat[] = $str_lien;
        }
    }
}
if ($arr_str_resultat)
{
    $arr_str_resultat = array_unique($arr_str_resultat);
    if (is_string($str_recherche) && $str_recherche !== '')
    {
        // Keep only the links containing the search string (plain, case-sensitive substring).
        $arr_str_resultat = array_filter(
            $arr_str_resultat,
            function ($str_lien) use ($str_recherche)
            {
                return strpos($str_lien, $str_recherche) !== FALSE;
            }
        );
    }
    // The links come from remote pages: escape each one before writing it into the page.
    // ENT_SUBSTITUTE keeps links from non-UTF-8 pages instead of blanking them.
    $arr_str_affichage = array_map(
        function ($str_lien)
        {
            return htmlspecialchars($str_lien, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        },
        $arr_str_resultat
    );
    echo '<textarea cols="100" rows="', max(1, count($arr_str_affichage)), '">', implode("\n", $arr_str_affichage), '</textarea>';
}
echo '<hr />';
// Shows this script's own source code below the tool.
// NOTE(review): this looks like a deliberate demo feature; remove it if the source must stay private.
highlight_file(__FILE__);
?>
</body>
</html>