-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathcompanynames.py
116 lines (95 loc) · 2.71 KB
/
companynames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
Generate random Web2.0 company names, along with a plausibility rating
for each.
Run companynames.sh first to grab the data.
(Quick-and-dirty hack.)
"""
import math
import operator
import random
import re
# Tunable settings; the functions below read these module-level names directly.
N = 3 # Markov model order: every n-gram counted is N characters long
nwords = 10000 # How many names main() generates and prints
min_length = 5 # Accept only names at least this long
max_length = 10 # and no longer than this
# If true, a name is rated by its total -log10 probability, so longer names
# get worse ratings; if false, the rating is divided by the name's length.
penalize_length = True
def main():
    """Print nwords freshly generated names, one per line, each preceded
    by its cross_entropy() rating."""
    for _ in range(nwords):
        name = gen_name()
        score = cross_entropy(name)
        # A single parenthesized expression, so the print statement emits
        # exactly the formatted line.
        print('%5.2f %s' % (score, name.encode('utf8')))
def cross_entropy(name):
    """Return the rating of `name`: -log10 of its model probability.

    Larger values mean the model finds the name less probable. When
    penalize_length is false the value is divided by len(name).
    """
    surprisal = -math.log10(model_probability(name))
    if penalize_length:
        return surprisal
    return surprisal / len(name)
def model_probability(name):
    """Return the probability of `name` under the n-gram model: the product,
    over every n-gram of the padded name, of the conditional probability of
    the n-gram's last character given the characters before it."""
    padded = wrap(name)
    factors = (cPw(gram[-1], gram[:-1]) for gram in ngrams(padded))
    return product(factors)
def product(numbers):
    """Return the product of all items in the iterable `numbers`.

    An empty iterable yields 1. Written as an explicit loop so it does not
    depend on the bare `reduce` builtin, which only Python 2 provides.
    """
    result = 1
    for number in numbers:
        result = result * number
    return result
def cPw(letter, prefix):
    """Return the conditional probability that `letter` follows `prefix`,
    as the letter's count divided by the total count of all observed
    followers of `prefix`.

    Raises KeyError when `letter` was never observed after `prefix`.
    """
    followers = collect_choices(prefix)
    hits = float(followers[letter])
    total = sum(followers.values())
    return hits / total
def gen_name():
    """Return the first generated candidate that passes is_acceptable()."""
    accepted = (name for name in gen_lots() if is_acceptable(name))
    # The default mirrors falling off the end of a loop: None if the
    # candidate stream were ever exhausted.
    return next(accepted, None)
def is_acceptable(candidate):
    """Return True if `candidate` is purely alphabetic, has a length between
    min_length and max_length inclusive, and is not one of the names in
    companynames."""
    if not candidate.isalpha():
        return False
    if not (min_length <= len(candidate) <= max_length):
        return False
    return candidate not in companynames
def gen_lots():
    """Yield an endless stream of raw, unfiltered names from gen()."""
    # gen() returns strings, which never compare equal to this fresh
    # sentinel object, so the iteration never stops.
    for name in iter(gen, object()):
        yield name
def gen():
    """Generate one raw name by walking the n-gram model letter by letter.

    Starts from an all-'#' context and repeatedly picks a next letter given
    the last N-1 characters. Stops when the end marker '#' is picked or once
    more than 25 letters have been collected.
    """
    context = ('#' * N)[1:]
    letters = []
    while len(letters) <= 25:
        letter = pick_next(context)
        if letter == '#':
            break
        letters.append(letter)
        # Slide the window: append the new letter, drop the oldest character.
        context = (context + letter)[1:]
    return ''.join(letters)
def pick_next(state):
    """Return a randomly chosen next character for the context `state`,
    weighted by how often each character followed it in the counts."""
    options = collect_choices(state)
    return weighted_choice(options)
def collect_choices(state):
    """Return a dict mapping each character of the alphabet that was
    observed after `state` to the count of the n-gram state + character."""
    return dict((symbol, counts[state + symbol])
                for symbol in alphabet
                if state + symbol in counts)
def weighted_choice(choices):
    """Return a random key of `choices`, picked with probability
    proportional to its integer weight.

    choices: dict mapping each candidate to a non-negative integer weight.
    Raises ValueError if the weights do not sum to a positive number.
    """
    total = sum(choices.values())
    if total <= 0:
        raise ValueError("weighted_choice needs a positive total weight")
    # randint is inclusive at both ends. Drawing from 1..total gives exactly
    # `total` equally likely outcomes; starting at 0 would add one extra
    # outcome that always lands on the first candidate iterated.
    r = random.randint(1, total)
    for choice, k in choices.items():
        r -= k
        if r <= 0:
            return choice
    raise Exception("Can't happen")
def ngrams(string):
    """Return the list of all N-character substrings of `string`, in order
    of starting position; empty if `string` is shorter than N."""
    last_start = len(string) - N
    pieces = []
    for start in range(last_start + 1):
        pieces.append(string[start:start + N])
    return pieces
def clean(string):
    """Return `string` lowercased, with the HTML entity '&amp;' unescaped
    to a plain '&'.

    NOTE(review): the substitution previously replaced '&' with '&', which
    did nothing; unescaping '&amp;' is the presumed intent -- confirm against
    the data that companynames.sh produces.
    """
    # Lowercase first so that an upper-case '&AMP;' is unescaped too.
    return string.lower().replace('&amp;', '&')
def wrap(name):
    """Return `name` padded for n-gram extraction: N-1 '#' characters in
    front and a single '#' end marker behind."""
    lead = '#' * (N - 1)
    return lead + name + '#'
# Build the model at import time from the 'companynames' file in the
# current directory (one name per line, decoded as UTF-8).
with open('companynames') as names_file:
    # The set is fully built inside the with-block, which then closes the
    # file instead of leaving the handle open.
    companynames = set(clean(unicode(line.rstrip('\n'), 'utf8'))
                       for line in names_file)

# counts maps each N-character n-gram of the padded names to the number of
# times it occurs across the distinct cleaned names.
counts = {}
for name in companynames:
    for ngram in ngrams(wrap(name)):
        counts[ngram] = counts.get(ngram, 0) + 1

# Every character that appears in any counted n-gram (this includes the
# '#' padding that wrap() adds).
alphabet = set(''.join(counts.keys()))
# NOTE(review): the '## ...' and '#. ...' lines look like a recorded example
# call and sample output from an earlier run -- confirm before relying on them.
## gen()
#. u'hertionet'
# Called unconditionally: running (or importing) this module prints nwords
# rated names.
main()
#. 1.14 senes
#. 1.28 actuvus
#. 0.86 prons
#. 0.98 softwevox
#. 0.94 zooks
#. 1.39 litspa
#. 1.26 auffby
#. 0.96 edion
#. 0.83 stedia
#. 0.89 tecties
#.