bad-word-detector.py
import sys
import time

try:
    from nltk import wordpunct_tokenize
    from nltk.corpus import stopwords
except ImportError:
    print('[!] You need to install nltk (http://nltk.org/index.html)')
    sys.exit(1)

def calculate_languages_ratios(text):
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    # For every language shipped with NLTK, count the unique stopwords that appear in the text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    return languages_ratios

def detect_language(text):
    # The language whose stopwords overlap the text most is taken as the detected language
    ratios = calculate_languages_ratios(text)
    most_rated_language = max(ratios, key=ratios.get)
    return most_rated_language

def load_bad_words(language):
    # Return the bad-word list for a supported language, or False if unsupported or missing
    if language.upper() in ['ENGLISH', 'FRENCH', 'SPANISH', 'GERMAN']:
        try:
            badwords_list = []
            # Word lists are stored one word per line in datasets/<language>.csv
            with open('datasets/' + language.lower() + '.csv', 'r') as lang_file:
                for word in lang_file:
                    badwords_list.append(word.lower().strip('\n'))
            return badwords_list
        except OSError:
            return False
    else:
        return False

def load_file(filename):
    # Open the input text file for reading
    return open(filename, 'r')

if len(sys.argv) < 2:
    print('Usage : python bad-word-detector.py <input-file>')
    sys.exit(1)

filename = sys.argv[1]
print('Input File Name : ' + filename)
try:
    text = ''
    line_count = 1
    # Build a numbered copy of the input so findings can be reported per line
    with load_file(filename) as input_file:
        for i in input_file:
            text += str(line_count) + '| ' + i
            line_count += 1
    print('\n')
    time.sleep(2)
    print('-----------------Input Text-----------------')
    print(text)
    print('--------------------------------------------\n')
except Exception as e:
    print('Error occurred while loading text file. Error : ' + str(e))
    sys.exit(1)

language = detect_language(text)
print('\n')
time.sleep(1)
print('----------------------------')
print('Language Detected : ' + language.upper())
print('----------------------------')
print('\n')
time.sleep(1)
print('Checking for bad words in ' + language.upper() + ' language...')
print('**********************************************************\n')
badwords = load_bad_words(language)
if badwords is False:
    print('Error occurred in program - no bad-word list available for ' + language.upper() + ' language')
    sys.exit(1)
badwords = set(badwords)

# Scan the numbered text line by line and report any bad words found
text_list = text.split('\n')
for line_index, sentence in enumerate(text_list):
    line_number = str(line_index + 1)
    # Strip punctuation so tokens match the plain words in the bad-word list
    for key in ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']:
        sentence = sentence.replace(key, '')
    abuses = [i for i in sentence.lower().split() if i in badwords]
    if not abuses:
        continue
    time.sleep(0.5)
    print('-- ' + str(len(abuses)) + ' Bad Words found at line number : ' + line_number + ' --')
    print('Bad Words : ' + ', '.join(abuses))
    print('-----------------\n')
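
The script is run from the command line with the text file to scan as its only argument. A minimal invocation sketch, assuming a sample file named input.txt in the working directory and the datasets/ word lists sitting next to the script (the input file name is illustrative, not part of the repository):

    python bad-word-detector.py input.txt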