-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfilter.go
98 lines (81 loc) · 1.89 KB
/
filter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
package wikipedia
import (
"regexp"
"strings"
)
// CategoryFilter configures FilterByCategory.
type CategoryFilter struct {
	// Pattern is the regular expression source compiled by CategoryRegexp
	// and handed to Page.Categories.
	Pattern string

	// Allowed and Denied are substrings looked up in a page's category
	// text: a page is kept when at least one Allowed entry occurs in it and
	// no Denied entry does.
	Allowed, Denied []string
}
// CategoryRegexp compiles c.Pattern and returns the resulting regexp.
//
// The pattern is compiled afresh on every call, and regexp.MustCompile
// panics when the pattern is not a valid regular expression, so callers that
// need the regexp repeatedly should keep the returned value.
func (c *CategoryFilter) CategoryRegexp() *regexp.Regexp {
	compiled := regexp.MustCompile(c.Pattern)
	return compiled
}
// TitleLengthFilter configures FilterByTitleLength.
type TitleLengthFilter struct {
	// Min and Max are the inclusive lower and upper bounds on len(Title),
	// i.e. on the title's length in bytes.
	Min, Max int
}
// TitleFilter configures FilterByTitle.
type TitleFilter struct {
	// AllowContain lists substrings of which a title must contain at least
	// one to be kept.
	AllowContain []string

	// RemoveContain lists substrings that disqualify a title containing any
	// of them; RemoveExact lists whole titles that are dropped on an exact
	// match.
	RemoveContain, RemoveExact []string
}
// FilterByTitleLength copies from pages to filtered every page whose title
// length lies in the inclusive range [filter.Min, filter.Max], preserving
// order, and closes filtered once pages has been drained.
//
// The length is len(page.Title), which counts bytes, not characters.
// NOTE(review): for non-ASCII titles this differs from the character count —
// confirm that byte length is what callers intend.
func FilterByTitleLength(pages <-chan Page, filtered chan<- Page, filter TitleLengthFilter) {
	for page := range pages {
		n := len(page.Title)
		if n < filter.Min || n > filter.Max {
			continue
		}
		filtered <- page
	}
	close(filtered)
}
// FilterByCategory forwards to filtered, in order, every page from pages
// whose category text matches at least one entry of filter.Allowed and no
// entry of filter.Denied. It closes filtered once pages has been drained.
//
// Matching is by substring: an entry matches when it occurs anywhere in the
// string returned by page.Categories. An empty filter.Allowed therefore
// rejects every page.
//
// The category regexp is obtained once, before the first page is read,
// instead of being recompiled for every page. filter.CategoryRegexp panics on
// an invalid Pattern, so a bad pattern aborts before any page is consumed.
func FilterByCategory(pages <-chan Page, filtered chan<- Page, filter CategoryFilter) {
	// containsAny reports whether text contains at least one of subs.
	containsAny := func(text string, subs []string) bool {
		for _, sub := range subs {
			if strings.Contains(text, sub) {
				return true
			}
		}
		return false
	}

	// Loop-invariant: the pattern does not change while pages is drained.
	re := filter.CategoryRegexp()

	for page := range pages {
		categories := page.Categories(re)
		if containsAny(categories, filter.Allowed) && !containsAny(categories, filter.Denied) {
			filtered <- page
		}
	}
	close(filtered)
}
// FilterByRedirect copies from pages to filtered every page whose redirect
// title is empty (len(page.Redir.Title) == 0), preserving order, and closes
// filtered once pages has been drained. Pages carrying a non-empty redirect
// title are dropped.
func FilterByRedirect(pages <-chan Page, filtered chan<- Page) {
	for page := range pages {
		if len(page.Redir.Title) != 0 {
			continue
		}
		filtered <- page
	}
	close(filtered)
}
// FilterByTitle forwards to filtered, in order, every page from pages whose
// title passes all three checks of filter, and closes filtered once pages has
// been drained. A page is kept only when its title
//
//   - contains at least one entry of filter.AllowContain,
//   - contains no entry of filter.RemoveContain, and
//   - is not equal to any entry of filter.RemoveExact.
//
// Because the first check needs a match, an empty filter.AllowContain rejects
// every page.
func FilterByTitle(pages <-chan Page, filtered chan<- Page, filter TitleFilter) {
	// equalsAny reports whether s is identical to one of candidates.
	equalsAny := func(s string, candidates []string) bool {
		for _, c := range candidates {
			if s == c {
				return true
			}
		}
		return false
	}
	// containsAny reports whether s contains at least one of subs.
	containsAny := func(s string, subs []string) bool {
		for _, sub := range subs {
			if strings.Contains(s, sub) {
				return true
			}
		}
		return false
	}

	for page := range pages {
		title := page.Title
		if !containsAny(title, filter.AllowContain) {
			continue
		}
		if containsAny(title, filter.RemoveContain) || equalsAny(title, filter.RemoveExact) {
			continue
		}
		filtered <- page
	}
	close(filtered)
}