Skip to content

Commit 00e2d46

Browse files
author
Florian Bersier
committed
new file: scrapifox.php
1 parent 1dc7223 commit 00e2d46

File tree

1 file changed

+122
-0
lines changed

1 file changed

+122
-0
lines changed

scrapifox.php

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
<?php

/*
Author: Florian Bersier (xpressyoo)
Date: September 2011
URL: https://github.com/xpressyoo/ScrAPI

===========================================

This work is licensed under the Creative Commons Attribution 2.0 UK: England & Wales License. To view a copy of this license, visit http://creativecommons.org/licenses/by/2.0/uk/ or send a letter to Creative Commons, 444 Castro Street, Suite 900, Mountain View, California, 94041, USA.

*/

session_start();

/**
 * Read a value from $_SERVER without raising a notice when it is missing.
 *
 * Request headers such as the referer, user agent and accepted language are
 * optional, so their $_SERVER entries may not exist.
 *
 * @param string $key Name of the $_SERVER entry.
 * @return string The stored value, or an empty string when the entry is absent.
 */
function scrapifox_server_value($key)
{
    return isset($_SERVER[$key]) ? $_SERVER[$key] : '';
}

/**
 * Escape a value for safe inclusion in HTML text or attribute context.
 *
 * @param mixed $text Value to print.
 * @return string The HTML-escaped string.
 */
function scrapifox_escape($text)
{
    return htmlspecialchars((string) $text, ENT_QUOTES, 'UTF-8');
}

/**
 * Build the add-on page URL from the slug submitted through the form.
 *
 * The slug is trimmed of surrounding whitespace and slashes and then
 * percent-encoded, so it can only ever address a single path segment below
 * /firefox/addon/ and cannot inject extra path parts or a query string.
 *
 * @param mixed $slug Raw value of the "url" form field.
 * @return string|null The add-on URL (no trailing slash), or null when the slug is unusable.
 */
function scrapifox_addon_url($slug)
{
    if (!is_string($slug)) {
        return null;
    }

    $slug = trim($slug, " \t\n\r\0\x0B/");
    if ($slug === '') {
        return null;
    }

    return 'https://addons.mozilla.org/en-US/firefox/addon/' . rawurlencode($slug);
}

/**
 * Extract the counter held by the first <b>...</b> element of a page.
 *
 * Thousands separators (",") are dropped and the leading run of digits is
 * taken as the number.
 *
 * @param string|false $markup Page markup, or false when the download failed.
 * @return int|null The counter, or null when no numeric <b> element is found.
 */
function scrapifox_parse_count($markup)
{
    if (!is_string($markup) || !preg_match('#<b>(.*?)</b>#i', $markup, $match)) {
        return null;
    }

    $candidate = trim(str_replace(',', '', $match[1]));
    if (!preg_match('/^\d+/', $candidate, $leading)) {
        return null;
    }

    return (int) $leading[0];
}

/**
 * Number of review pages needed to list a given number of reviews.
 *
 * @param int $reviewCount Total number of reviews.
 * @param int $perPage     Reviews listed on each page (the script has always assumed 20).
 * @return int The page count, rounded up; 0 when there is nothing to list.
 */
function scrapifox_page_count($reviewCount, $perPage = 20)
{
    if ($reviewCount <= 0 || $perPage <= 0) {
        return 0;
    }

    return (int) ceil($reviewCount / $perPage);
}

/**
 * Collect the review texts found in one loaded review page.
 *
 * Every <div> whose class attribute contains "review" but not "reply" is
 * visited (replies are the comments written by the add-on's developer). For
 * each such <div> the text of its first <p class="review-body"> descendant is
 * taken, trimmed, and stripped of double quotes.
 *
 * NOTE(review): the class test is a substring match, so a <div> without any
 * review body (for instance a wrapper) still yields an entry: an empty string.
 * This is kept because the length metric has always included such entries.
 *
 * @param DOMDocument $page Parsed review page.
 * @return array List of review strings, one per matched <div>.
 */
function scrapifox_collect_reviews(DOMDocument $page)
{
    $reviews = array();
    $xpath = new DOMXPath($page);
    $blocks = $xpath->query("//div[contains(@class, 'review') and not(contains(@class, 'reply'))]");

    if ($blocks === false) {
        return $reviews;
    }

    foreach ($blocks as $block) {
        // Query relative to the matched <div> instead of copying it into a new document.
        $bodies = $xpath->query(".//p[@class='review-body']", $block);
        $body = ($bodies === false) ? null : $bodies->item(0);
        $text = ($body === null) ? '' : trim($body->nodeValue);
        $reviews[] = str_replace("\"", "", $text);
    }

    return $reviews;
}

/**
 * Compute the length figure that is accumulated for one page of reviews.
 *
 * The figure is the byte length of a print_r() dump of the reviews after the
 * dump has been HTML-escaped and then stripped of "[0]", "Array", "(", ")",
 * "=", "&gt;", "[", "]", every digit and every space.
 *
 * NOTE(review): because it is derived from the dump, the figure also counts
 * the dump's line breaks and the extra characters of HTML entities, and it
 * leaves out digits, spaces, brackets and the word "Array" when they occur in
 * a review. It is reproduced unchanged so results stay comparable with
 * earlier runs.
 *
 * @param array $reviews List of review strings.
 * @return int Length figure for the page.
 */
function scrapifox_measure_reviews(array $reviews)
{
    // The dump has always been taken from a list of one-element arrays.
    $rows = array();
    foreach ($reviews as $review) {
        $rows[] = array($review);
    }

    $dump = htmlspecialchars(print_r($rows, true));

    // str_replace() applies the needles one after another, in this order.
    $noise = array(
        "[0]", "Array", "(", ")", "=", "&gt;", "[", "]",
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", " ",
    );

    return strlen(str_replace($noise, "", $dump));
}
?>
<html>
<head>
<style type="text/css">
body{margin:4% 0 0 3%}
ul li{list-style:none;display:inline-block;margin:0;padding:0;font-family:Arial,sans-serif;font-size:16px}
li:first-child{margin-right:20px;border-right:1px solid #333;width:250px;color:#666}
li{list-style:none;display:inline-block;padding:5px 10px;line-height:30px;vertical-align:top}
h4{background-color:lightyellow;padding:3px 7px}p{font-size:14px;color:#999}
</style>
</head>
<body>
<?php

// Request metadata. Header-derived entries are optional, hence the guarded reads.
$errors = 0; // number of review pages that could not be loaded
$base = scrapifox_server_value('HTTP_REFERER');
$ip = scrapifox_server_value('REMOTE_ADDR');
$browser = scrapifox_server_value('HTTP_USER_AGENT');
$language = scrapifox_server_value('HTTP_ACCEPT_LANGUAGE');
$date = date('j/m/Y');
$time = date('G:i');
$etime = date('B');

if (isset($_POST['submit'])) { // Retrieve the add-on slug from an INPUT field

    // GET URL
    $url = scrapifox_addon_url(isset($_POST['url']) ? $_POST['url'] : null);

    if ($url === null) {
        echo "<p>Please provide the name of an add-on.</p>";
    } else {
        // REVIEWS: counter shown on the reviews listing
        $nbreviews = scrapifox_parse_count(file_get_contents($url . '/reviews/'));

        // USERS: counter shown on the add-on page itself
        $nbusers = scrapifox_parse_count(file_get_contents($url));

        if ($nbreviews === null || $nbusers === null) {
            // Download failed or the page did not contain the expected counter.
            echo "<h4>" . scrapifox_escape($url) . "</h4><p>The number of reviews or users could not be retrieved for this add-on.</p>";
        } else {
            // PAGES
            $pages = scrapifox_page_count($nbreviews);

            // RATIO (guarded: an add-on may report zero users)
            $ratio = ($nbusers > 0) ? $nbreviews / $nbusers : 0;

            // INITIALIZATION
            $add = 0;

            // LOOP REVIEWS SCRAPING
            // Real-world HTML is rarely well-formed; collect libxml errors instead of printing them.
            $oldSetting = libxml_use_internal_errors(true);

            for ($i = 1; $i <= $pages; $i++) {
                libxml_clear_errors();

                $html = new DOMDocument();
                if (!$html->loadHTMLFile($url . '/reviews/?page=' . $i)) {
                    $errors++;
                    continue;
                }

                $add += scrapifox_measure_reviews(scrapifox_collect_reviews($html));
            }

            libxml_clear_errors();
            libxml_use_internal_errors($oldSetting);

            // DISPLAY RESULTS
            $final = ($nbreviews > 0) ? round($add / $nbreviews, 6) : 0;

            echo "<h4>" . scrapifox_escape($url) . "</h4><ul><li>Number of Reviews<br />Number of Pages<br />Number of Users<br />Ratio Reviews/Users<br />Total number of strings<br />Avg number of string per review</li><li>" . $nbreviews . "<br />" . $pages . "<br />" . $nbusers . "<br />" . $ratio . "<br />" . $add . "<br />" . $final . "</li></ul><br /><p>Data retrieved on " . $date . " at " . $time . " from " . scrapifox_escape($browser) . "</p>";

            if ($errors > 0) {
                echo "<p>" . $errors . " review page(s) could not be loaded and are missing from the totals.</p>";
            }
        }
    }
}

?>
</body>
</html>

0 commit comments

Comments
 (0)