Skip to content

Commit 5bfca7e

Browse files
committed
Initial commit
0 parents  commit 5bfca7e

File tree

9 files changed

+518
-0
lines changed

9 files changed

+518
-0
lines changed

.gitignore

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
syntax: glob
2+
*.class
3+
.metadata
4+
.directory
5+
.keep
6+
.DS_Store
7+
*/\${builder}
8+
9+
**/*.log*
10+
11+
**/*.jar
12+
13+
*/lib/*jar
14+
**/target
15+
**/.settings
16+
target
17+
bin
18+
*~
19+
*.orig
20+
.classpath
21+
.project
22+
.settings
23+
.cproject
24+
.Rhistory
25+
.Rapp.history

README.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Hierarchical Dirichlet Process Gibbs sampling
2+
=============================================
3+
4+
5+
(Re-)Implementation attempt of:
6+
Hierarchical Bayesian Nonparametric Models with Applications.
7+
Y.W. Teh and M.I. Jordan. Bayesian Nonparametrics, 2010. Cambridge University Press.
8+
http://www.gatsby.ucl.ac.uk/~ywteh/research/npbayes/TehJor2010a.pdf
9+
10+

pom.xml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3+
4+
<modelVersion>4.0.0</modelVersion>
5+
<groupId>de.uni-leipzig.informatik.asv</groupId>
6+
<artifactId>hdp</artifactId>
7+
<properties>
8+
<maven.compiler.source>1.6</maven.compiler.source>
9+
<maven.compiler.target>1.6</maven.compiler.target>
10+
<encoding>UTF-8</encoding>
11+
</properties>
12+
13+
<name>Hierarchical Dirichlet Processes</name>
14+
<version>0.0.1-SNAPSHOT</version>
15+
<description>Hierarchical Dirichlet Processes with Gibbs Sampling</description>
16+
<dependencies>
17+
</dependencies>
18+
19+
<build>
20+
<plugins>
21+
</plugins>
22+
</build>
23+
<repositories>
24+
</repositories>
25+
</project>
26+
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package de.uni_leipzig.informatik.asv.hdp;
2+
3+
import java.io.BufferedReader;
4+
import java.io.FileInputStream;
5+
import java.io.FileNotFoundException;
6+
import java.io.InputStream;
7+
import java.io.InputStreamReader;
8+
import java.util.ArrayList;
9+
10+
public class Corpus {
11+
12+
public int sizeVocabulary = 0;
13+
public int totalNumberOfWords = 0;
14+
public ArrayList<Document> docs;
15+
16+
public void read(String filename) throws FileNotFoundException {
17+
18+
InputStream is = new FileInputStream(filename);
19+
int length, word;
20+
Document d;
21+
22+
try {
23+
docs = new ArrayList<Document>();
24+
BufferedReader br = new BufferedReader(new InputStreamReader(is,
25+
"UTF-8"));
26+
String line = null;
27+
while ((line = br.readLine()) != null) {
28+
try {
29+
String[] fields = line.split(" ");
30+
length = Integer.parseInt(fields[0]);
31+
d = new Document(length);
32+
for (int n = 0; n < length; n++) {
33+
String[] wordCounts = fields[n + 1].split(":");
34+
word = Integer.parseInt(wordCounts[0]);
35+
d.words[n] = word;
36+
d.counts[n] = Integer.parseInt(wordCounts[1]);
37+
d.total += Integer.parseInt(wordCounts[1]);
38+
if (word >= sizeVocabulary)
39+
sizeVocabulary = word + 1;
40+
}
41+
totalNumberOfWords += d.total;
42+
docs.add(d);
43+
} catch (Exception e) {
44+
System.err.println(e.getMessage() + "\n");
45+
}
46+
}
47+
} catch (Exception e) {
48+
e.printStackTrace();
49+
}
50+
51+
}
52+
53+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
package de.uni_leipzig.informatik.asv.hdp;
2+
3+
import java.util.ArrayList;
4+
import java.util.Collections;
5+
6+
public class DOCState {
7+
8+
static int idCounter = 0;
9+
10+
int docID, documentLength, numberOfTables;
11+
12+
ArrayList<Integer> tableToTopic = new ArrayList<Integer>();
13+
ArrayList<Integer> wordCountByTable = new ArrayList<Integer>();
14+
WordInfo[] words;
15+
16+
17+
public DOCState(Document doc){
18+
docID = idCounter++;
19+
numberOfTables = 0;
20+
documentLength = doc.total;
21+
words = new WordInfo[documentLength];
22+
for (int k = 0; k < 2; k++){
23+
tableToTopic.add(null);
24+
wordCountByTable.add(0);
25+
}
26+
int word, count, m = 0;
27+
for (int n = 0; n < doc.numberOfUniquTerms; n++) {
28+
word = doc.words[n];
29+
count = doc.counts[n];
30+
for (int j = 0; j < count; j++) {
31+
words[m] = new WordInfo(word, -1);
32+
m++;
33+
}
34+
}
35+
}
36+
37+
38+
public void defragment(int[] kOldToKNew) {
39+
int[] tOldToTNew = new int[numberOfTables];
40+
int t, newNumberOfTables;
41+
for (t = 0, newNumberOfTables = 0; t < numberOfTables; t++){
42+
if (wordCountByTable.get(t) > 0){
43+
tOldToTNew[t] = newNumberOfTables;
44+
tableToTopic.set(newNumberOfTables, kOldToKNew[tableToTopic.get(t)]);
45+
Collections.swap(tableToTopic, newNumberOfTables, t);
46+
newNumberOfTables ++;
47+
} else
48+
tableToTopic.set(t, -1);
49+
}
50+
numberOfTables = newNumberOfTables;
51+
for (int i = 0; i < documentLength; i++)
52+
words[i].tableAssignment = tOldToTNew[words[i].tableAssignment];
53+
}
54+
55+
56+
57+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package de.uni_leipzig.informatik.asv.hdp;
2+
3+
/**
 * Sparse bag-of-words representation of one document: parallel arrays of term ids
 * and their counts, plus the running total of all counts.
 */
public class Document {

    /** Term ids; {@code words[i]} pairs with {@code counts[i]}. */
    public int[] words;

    /** Occurrence counts, parallel to {@link #words}. */
    public int[] counts;

    /** Number of slots in {@link #words} and {@link #counts}. */
    public int numberOfUniquTerms;

    /** Total token count; starts at zero and is filled in by whoever populates the arrays. */
    public int total;

    /**
     * Allocates zero-filled term and count arrays with {@code len} slots each.
     *
     * @param len number of distinct terms the document holds
     */
    public Document(int len) {
        this.numberOfUniquTerms = len;
        this.words = new int[len];
        this.counts = new int[len];
    }

}

0 commit comments

Comments
 (0)