Skip to content

Commit cd74573

Browse files
author
Nat Taylor
committed
Working version
0 parents  commit cd74573

File tree

5 files changed

+1847
-0
lines changed

5 files changed

+1847
-0
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#php-teaser#
2+
3+
Summarize text or articles into a few bullet points
4+
5+
##Usage##
6+
Create an instance of `Teaser`, then call `createSummary()` with either a URL or a text/title pair; it returns the summary as an array of sentences.
7+
8+
//Ultra-simple Example
9+
$teaser = new Teaser();
10+
$teaser->createSummary("http://www.business2community.com/cloud-computing/confused-saas-paas-iaas-0687173", "url");
11+
12+
##Notes##
13+
- Is there a lot more to do? Yes. Does it basically work? Yes.
14+
- I tried to carefully document the class, but it needs more detail. This is coming soon.
15+
- (Obviously) This relies on the source text having some good sentences that summarize it. Without that, our summary will suck.
16+
- Based on https://github.com/xiaoxu193/PyTeaser, which is itself based on http://www.textteaser.com/
17+
- What would make this a lot better? Tweaking the scoring, duh!

class.Readability.php

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
<?php
2+
// vim: set et sw=4 ts=4 sts=4 fdm=marker ff=unix fenc=utf8 nobomb:
3+
/**
4+
* PHP Readability
5+
*
6+
* PHP port of Readability; for details see
7+
* http://code.google.com/p/arc90labs-readability/
8+
*
9+
* ChangeLog:
10+
*
11+
* [+] 2011-02-17 Initial version
12+
*
13+
* @author mingcheng<i.feelinglucky#gmail.com>
14+
* @date 2011-02-17
15+
* @link http://www.gracecode.com/
16+
*/
17+
18+
// Version number of this Readability port, exposed as a global constant.
define('READABILITY_VERSION', 0.12);
19+
20+
/**
 * Extracts the main content block and the title from an HTML page.
 *
 * The page is parsed with DOMDocument; every element that directly contains
 * <p> elements is scored, and the highest-scoring element is returned as the
 * page content after junk tags and attributes have been stripped from a copy.
 */
class Readability {
    // Name of the DOM attribute in which a candidate element's score is stored.
    const ATTR_CONTENT_SCORE = "contentScore";

    // The DOM parser is only driven with UTF-8 encoded input.
    const DOM_DEFAULT_CHARSET = "utf-8";

    // Text of the placeholder element returned when no content block is found.
    const MESSAGE_CAN_NOT_GET = "Sorry, readability was unable to parse this page for content. \n
If you feel like it should have been able to,
please let me know by mail: lucky[at]gracecode.com";

    // DOM parser instance (DOMDocument, built into PHP 5).
    protected $DOM = null;

    // The original, unmodified source handed to the constructor.
    protected $source = "";

    // Distinct parent elements of the paragraphs scored by the last getTopBox() run.
    private $parentNodes = array();

    // Tags that are removed from the extracted content.
    private $junkTags = Array("style", "form", "iframe", "script", "button", "input", "textarea");

    // Attributes that are removed from every element of the extracted content.
    private $junkAttrs = Array("style", "class", "onclick", "onmouseover", "align", "border", "margin");


    /**
     * Parses the given HTML source into a DOM tree.
     *
     * If the parser reports failure the object is still constructed (best
     * effort); the DOM is then left in whatever state the parser produced.
     *
     * @param string $source     HTML source of the page
     * @param string $input_char Character encoding of $source; defaults to utf-8
     */
    public function __construct($source, $input_char = "utf-8") {
        $this->source = $source;

        // Turn non-ASCII characters into HTML entities so the parser only sees ASCII.
        $source = mb_convert_encoding($source, 'HTML-ENTITIES', $input_char);

        // Pre-process the markup (strip charset declaration, <font> tags, ...).
        $source = $this->preparSource($source);

        $this->DOM = new DOMDocument('1.0', $input_char);

        // Real-world HTML produces many parser warnings; collect them internally
        // instead of emitting them, then restore the caller's libxml setting.
        $previousSetting = libxml_use_internal_errors(true);
        $loaded = $this->DOM->loadHTML(
            '<?xml encoding="' . self::DOM_DEFAULT_CHARSET . '">' . $source
        );
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        if (!$loaded) {
            // Best effort: keep the document as the parser left it.
            return;
        }

        // The "<?xml encoding=...>" prefix is only a hack to force the charset.
        // childNodes is a live list, so collect the nodes first, then remove them.
        $hackNodes = array();
        foreach ($this->DOM->childNodes as $item) {
            if ($item->nodeType == XML_PI_NODE) {
                $hackNodes[] = $item;
            }
        }
        foreach ($hackNodes as $item) {
            $this->DOM->removeChild($item);
        }

        // Record the real encoding on the document.
        $this->DOM->encoding = self::DOM_DEFAULT_CHARSET;
    }


    /**
     * Pre-processes the HTML source so that the DOM parser handles it well.
     *
     * @param string $string HTML source
     * @return String the cleaned-up, trimmed source
     */
    private function preparSource($string) {
        // Drop the first charset declaration so it cannot conflict with the
        // encoding that is forced when the document is loaded.
        $string = preg_replace('/charset=([\w|\-]+);?/', "", $string, 1);

        // Replace doubled-up <br> tags (including the "<br />" form) with a
        // paragraph break, and remove <font> tags.
        $string = preg_replace('/<br\s*\/?>\s*<br\s*\/?>/i', "</p><p>", $string);
        $string = preg_replace('/<\/?font[^>]*>/i', "", $string);

        return trim($string);
    }


    /**
     * Removes every $TagName element below $RootNode.
     *
     * @param DOMDocument $RootNode node to clean
     * @param string      $TagName  tag name to remove
     * @return DOMDocument the same node that was passed in
     */
    private function removeJunkTag($RootNode, $TagName) {
        $Tags = $RootNode->getElementsByTagName($TagName);

        // The node list is live: removing an element shrinks it. Walking it
        // backwards guarantees that no element is skipped.
        for ($i = $Tags->length - 1; $i >= 0; $i--) {
            $Tag = $Tags->item($i);
            if ($Tag && $Tag->parentNode) {
                $Tag->parentNode->removeChild($Tag);
            }
        }

        return $RootNode;
    }

    /**
     * Removes the attribute $Attr from every element below $RootNode.
     *
     * @param DOMDocument $RootNode node to clean
     * @param string      $Attr     attribute name to remove
     * @return DOMDocument the same node that was passed in
     */
    private function removeJunkAttr($RootNode, $Attr) {
        foreach ($RootNode->getElementsByTagName("*") as $Tag) {
            $Tag->removeAttribute($Attr);
        }

        return $RootNode;
    }

    /**
     * Scores a class name or id value.
     *
     * @param string $value           the attribute value to inspect
     * @param string $positivePattern regex that marks the value as likely content
     * @return int -50 for comment/meta/footer-like names, 25 for content-like names, else 0
     */
    private function scoreIdentifier($value, $positivePattern) {
        if (preg_match("/(comment|meta|footer|footnote)/i", $value)) {
            return -50;
        }
        if (preg_match($positivePattern, $value)) {
            return 25;
        }
        return 0;
    }

    /**
     * Finds the element that most likely holds the main content of the page.
     * The scoring approach comes from http://code.google.com/p/arc90labs-readability/
     *
     * @return DOMNode the best-scoring paragraph parent, or a placeholder <div>
     *                 containing MESSAGE_CAN_NOT_GET when nothing scored above zero
     */
    private function getTopBox() {
        // Scores are persisted as attributes in the DOM. Clear what a previous
        // run left behind so repeated calls do not accumulate scores.
        foreach ($this->parentNodes as $scoredNode) {
            $scoredNode->removeAttribute(self::ATTR_CONTENT_SCORE);
        }
        $this->parentNodes = array();

        $classPattern = '/((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i';
        $idPattern = '/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i';

        // Study all the paragraphs and find the chunk that has the best score.
        $seen = array();
        foreach ($this->DOM->getElementsByTagName("p") as $paragraph) {
            $parentNode = $paragraph->parentNode;
            if (!($parentNode instanceof DOMElement)) {
                continue;
            }

            $contentScore = intval($parentNode->getAttribute(self::ATTR_CONTENT_SCORE));

            // Look for a special class name and a special id on the parent.
            $contentScore += $this->scoreIdentifier($parentNode->getAttribute("class"), $classPattern);
            $contentScore += $this->scoreIdentifier($parentNode->getAttribute("id"), $idPattern);

            // Paragraphs longer than 10 bytes add their byte length to the score.
            $textLength = strlen($paragraph->nodeValue);
            if ($textLength > 10) {
                $contentScore += $textLength;
            }

            // Store the running score on the parent element.
            $parentNode->setAttribute(self::ATTR_CONTENT_SCORE, $contentScore);

            // Remember each parent once so the selection below can find it quickly.
            $key = spl_object_hash($parentNode);
            if (!isset($seen[$key])) {
                $seen[$key] = true;
                $this->parentNodes[] = $parentNode;
            }
        }

        // Start from the placeholder (score 0); only a strictly higher, non-zero
        // score replaces the current best candidate.
        $topBox = $this->DOM->createElement('div', self::MESSAGE_CAN_NOT_GET);
        foreach ($this->parentNodes as $parentNode) {
            $contentScore = intval($parentNode->getAttribute(self::ATTR_CONTENT_SCORE));
            $orgContentScore = intval($topBox->getAttribute(self::ATTR_CONTENT_SCORE));

            if ($contentScore && $contentScore > $orgContentScore) {
                $topBox = $parentNode;
            }
        }

        return $topBox;
    }


    /**
     * Returns the first <title> element of the page.
     *
     * @return DOMNode|null the title node (read its text via ->nodeValue),
     *                      or null when the page has no <title>
     */
    public function getTitle() {
        return $this->DOM->getElementsByTagName("title")->item(0);
    }


    /**
     * Returns the main content of the page after Readability processing.
     *
     * @return Array|false array with the keys 'title' (string) and 'content'
     *                     (HTML string), or false when no DOM is available
     */
    public function getContent() {
        if (!$this->DOM) return false;

        // Page title node (may be null).
        $ContentTitle = $this->getTitle();

        // Element holding the main content.
        $ContentBox = $this->getTopBox();

        // Copy the content into a fresh DOMDocument so the cleanup below does
        // not modify the parsed page.
        $Target = new DOMDocument;
        $Target->appendChild($Target->importNode($ContentBox, true));

        // Strip unwanted tags.
        foreach ($this->junkTags as $tag) {
            $Target = $this->removeJunkTag($Target, $tag);
        }

        // Strip unwanted attributes.
        foreach ($this->junkAttrs as $attr) {
            $Target = $this->removeJunkAttr($Target, $attr);
        }

        // Several values are returned together as an array.
        return Array(
            'title' => $ContentTitle ? $ContentTitle->nodeValue : "",
            'content' => mb_convert_encoding($Target->saveHTML(), self::DOM_DEFAULT_CHARSET, "HTML-ENTITIES")
        );
    }

    public function __destruct() { }
}

0 commit comments

Comments
 (0)