-
Notifications
You must be signed in to change notification settings - Fork 8
/
index.js
108 lines (95 loc) · 3.46 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/*
* www.openedcaptions.com routes captions from C-Span 1 channel to a socket end point.
* This script serves as an intermediate server that buffers text from the socket and exposes it as a REST API endpoint
* that also supports a char offset. See README for more info.
* author: Dan Z @impronunciable
*/
const io = require('socket.io-client')
const fs = require('fs')
const http = require('http')
const URL = require('url')
const s = require('underscore.string')
const parseCsv = require('csv-parse/lib/sync')
// Load proper noun dictionary; formatText reads each row as [text to find, replacement]
const words = parseCsv(fs.readFileSync('words.csv'))
// Where we stash our stuff: entries are {t: arrival time from Date.now(), r: caption text}
let cache = []
// Setup a cache buster so our cache doesn't use all the memory
const ttl = 20 * 60 * 1000 // 20 mins -> milliseconds
const cacheCheckInterval = 5 * 60 * 1000 // 5 mins -> milliseconds
setInterval(cleanCache, cacheCheckInterval)
// Setup a transcription file, if desired (stays `false` when TRANSCRIPT_FILE is unset)
let txt = false
if ( process.env.TRANSCRIPT_FILE ) {
  const transcriptFile = process.env.TRANSCRIPT_FILE
  if ( fs.existsSync(transcriptFile) ) {
    // Seed the cache with the existing transcript, read as text so `r` is always a string
    cache.push({t: Date.now(), r: fs.readFileSync(transcriptFile, 'utf8')})
  }
  // The option name is `flags`; a misspelt `flag` is ignored, which leaves the
  // default 'w' in effect and truncates the transcript instead of appending to it
  txt = fs.createWriteStream(transcriptFile, {flags: 'a'})
}
// Subscribe to the live caption feed and buffer every fragment with its arrival time
const socket = io.connect('https://openedcaptions.com:443')
socket.on('content', data => {
  const body = data.data.body
  // The transcript file receives everything, including bare line breaks
  if ( txt ) { txt.write(body) }
  // Bare line breaks are not cached
  if ( body === "\r\n" ) { return }
  const dat = {t: Date.now(), r: body}
  console.log(dat.t, dat.r)
  cache.push(dat)
})
// REST endpoint: `/` answers with JSON {now, captions}; any other path gets a plain-text 404.
// The optional `since` query parameter is compared against cache entry times (Date.now() values).
http.createServer((req, res) => {
  const url = URL.parse(req.url, true)
  if ( url.pathname !== '/' ) {
    res.writeHead(404, { 'Content-Type': 'text/plain' })
    res.end('404 not found')
    return
  }
  const now = Date.now()
  // Explicit radix so a value such as "0x10" is not read as hexadecimal
  const since = Number.parseInt(url.query.since, 10)
  // A missing or non-numeric `since` means "everything still cached"; NaN would
  // make every timestamp comparison false and silently return no captions
  const timestamp = Number.isNaN(since) ? 0 : since
  res.setHeader('Content-Type', 'application/json')
  res.end(JSON.stringify({
    now: now,
    captions: formatText(getWordsSince(timestamp))
  }))
}).listen(process.env.PORT || 5000)
/**
 * Clean up raw caption text for display.
 *
 * Lower-cases the input, collapses line breaks and redundant spaces, restores
 * the casing of entries from the `words` dictionary, then applies punctuation,
 * honorific, sentence-capitalisation and speaker-prompt formatting.
 *
 * @param {string} str - raw caption text
 * @returns {string} the formatted text
 */
function formatText(str) {
  // remove random line breaks (every occurrence, not just the first)
  let ret = str.toLowerCase().replace(/\r\n/g, ' ')
  ret = s.clean(ret) // remove redundant spaces
  // now use our words file to do a bunch of stuff
  words.forEach((pair) => {
    // Skip blank or incomplete rows rather than inserting "undefined"
    if ( !pair || !pair[0] || pair[1] === undefined ) { return }
    // Dictionary entries are literal text: escape every regex metacharacter so
    // entries such as "u.s." or "c++" neither over-match nor throw
    const literal = pair[0].replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
    // One pattern covers start-of-text, mid-text and end-of-text. The lookahead
    // leaves the delimiter unconsumed so adjacent occurrences both match, and the
    // leading space is re-emitted so a match at the end keeps its separator.
    const matcher = new RegExp(`(^| )${literal}(?=[ .,:']|$)`, 'gi')
    // Function replacement: `$` sequences in the dictionary value stay literal
    ret = ret.replace(matcher, (match, lead) => `${lead}${pair[1]}`)
  })
  ret = ret
    // Music notes
    .replace(/\s+b\x19\*\s+/g, '\n\n🎵\n\n')
    // remove blank space before punctuation
    .replace(/\s+(!|\?|;|:|,|\.|')/g, '$1')
    // handle honorifics
    .replace(/ (sen\.?|rep\.?|mr\.?|mrs\.?|ms\.?|dr\.?) (\w)/gi,
      (match, a, b) => { return ` ${s.capitalize(a)} ${b.toUpperCase()}` })
    // Cap first letter of sentences
    .replace(/(!|\?|:|\.|>>)\s+(\w)/g, (match, a, b) => { return `${a} ${b.toUpperCase()}` })
    // >> seems to be used instead of repeating speaker prompts in back and forths
    .replace(/\s*>>\s*/g, "\n\n>> ")
    // Put speaker prompts on new lines
    .replace(/(\.|"|!|\?|—)\s*([a-zA-Z. ]{2,30}:)/g, '$1\n\n$2')
  return ret
}
/**
 * Gather the text of every cached caption whose arrival time is at or after
 * the given timestamp.
 *
 * @param {number|string} timestamp - lower bound, run through parseInt before comparing
 * @returns {string} matching caption texts joined with single spaces ('' when none match)
 */
function getWordsSince(timestamp) {
  const since = parseInt(timestamp)
  // filter/map skip empty slots, so holes in the cache are ignored
  return cache
    .filter((entry) => entry.t >= since)
    .map((entry) => entry.r)
    .join(' ')
}
/**
 * Evict cache entries older than `ttl` milliseconds.
 *
 * The cache array is compacted in place. Using `delete` on elements would
 * leave holes, so the array's length would never shrink and index-based
 * readers would see `undefined` slots.
 */
function cleanCache() {
  const cutoff = Date.now() - ttl
  let kept = 0
  for (let i = 0; i < cache.length; i++) {
    const entry = cache[i]
    // Drop pre-existing holes and anything that arrived before the cutoff
    if ( entry !== undefined && !(entry.t < cutoff) ) {
      cache[kept] = entry
      kept += 1
    }
  }
  // Truncate the tail left over after shifting survivors forward
  cache.length = kept
}