updated documentation

remstef · remstef · commit 4b176092d955 · 2016-08-12T11:50:28.000+02:00
diff --git a/README.md b/README.md
diff --git a/README.md b/README.md
@@ -0,0 +1 @@
+lt.seg/README.MD
diff --git a/lt.seg/README.MD b/lt.seg/README.MD
@@ -1,36 +1,53 @@
+    ###
+    #   Copyright 2015
+    #
+    #   Licensed under the Apache License, Version 2.0 (the "License");
+    #   you may not use this file except in compliance with the License.
+    #   You may obtain a copy of the License at
+    #
+    #       http://www.apache.org/licenses/LICENSE-2.0
+    #
+    #   Unless required by applicable law or agreed to in writing, software
+    #   distributed under the License is distributed on an "AS IS" BASIS,
+    #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    #   See the License for the specific language governing permissions and
+    #   limitations under the License.
+    #
+    ###
+
 ### Prerequisities
 
 * Java v.8
 * (optional) bash v.4
 
 ### How to install
 
-* Download latest version from the releases site [https://github.com/de-tudarmstadt-lt/lt.core/releases][]
+* Download the latest version from the releases site: <https://github.com/de-tudarmstadt-lt/seg/releases>
 * unpack into a directory of choice: `tar -xzvf lt.seg-version-dist.tar.gz -C <your-preferred-directory>`
-* (optional) to access `seg.sh` from anywhere you can add <your-preferred-directory>/bin to PATH: `PATH=<your-preferred-directory>:$PATH` or symlink `seg.sh` into a directory which is already in your PATH
+* executables can be found in the `bin` directory, e.g. `bin/seg`. You can execute it from any directory.
+* (optional) to access the `seg` binary from anywhere you can add `<your-preferred-directory>/bin` to PATH: `PATH=<your-preferred-directory>/bin:$PATH` or symlink `seg` into a directory which is already in your PATH
 
-[https://github.com/de-tudarmstadt-lt/lt.core/releases]: https://github.com/de-tudarmstadt-lt/lt.core/releases "Releases"
 
 *Note: the following description is for unix based systems, you cannot run the startup shell scripts on MS Windows, consider using cygwin or run the java commands manually*
 
 ### How to use the lt.seg segmenter
 
 basic usage is as simple as
 
-		cat text.txt | seg.sh > segmented_text.txt
+		cat text.txt | seg > segmented_text.txt
 
 or 
 
-		seg.sh < text.txt > segmented_text.txt
+		seg < text.txt > segmented_text.txt
 
 or 
 
-		seg.sh -f text.txt > segmented_text.txt
+		seg -f text.txt > segmented_text.txt
 
 
-lt.seg comes with a number of parameters, run `seg.sh -?` to get a list of options
+lt.seg comes with a number of parameters, run `seg -?` to get a list of options
 
-*Note: for MS Windows based systems replace* `seg.sh` *with the correct java command, e.g.* `java -cp lt.seg-<version>-with-dependencies.jar de.tudarmstadt.lt.seg.app.Segmenter <options>`
+*Note: for MS Windows based systems replace* `seg` *with the correct java command, e.g.* `java -cp lt.seg-<version>-with-dependencies.jar de.tudarmstadt.lt.seg.app.Segmenter <options>`
 
 ### Options:
 * `--sentencesplitter <class>` (`-s`):
@@ -41,7 +58,8 @@ lt.seg comes with a number of parameters, run `seg.sh -?` to get a list of optio
 	* `NullSplitter`: Convenience splitter, returns the complete input as one segment
 * `--tokenizer <class>` (`-t`)
 	Sepcify the tokenizer class. Supported values are:
-	* `DiffTokenizer` (default): Applies simple rules based on the change on Unicode category of consecutive characters
+	* `RuleTokenizer` (default): Applies tokenization according to a ruleset specified by the `--token-ruleset` option parameter
+	* `DiffTokenizer`: Applies simple rules based on the change on Unicode category of consecutive characters
 	* `BreakTokenizer`: Java word breakiterator instance
 	* `EmptySpaceTokenizer`: creates a new segment only when empty spaces are found (supported empty spaces include but are not limited to: `<blank>`, `<protected-blank>`, `\t`, `\n`, `\r`, `\f`, ...)
 	* `NullTokenizer`: Convenience tokenizer, returns the complete input as one segment
diff --git a/lt.seg/src/main/java/de/tudarmstadt/lt/seg/app/Segmenter.java b/lt.seg/src/main/java/de/tudarmstadt/lt/seg/app/Segmenter.java
@@ -85,7 +85,7 @@ public Segmenter() {/* NOTHING TO DO */}
 		opts.addOption(OptionBuilder.withLongOpt("tokenizer").withArgName("class").hasArg().withDescription("Specify the class of the word tokinzer that you want to use: {BreakTokenizer, DiffTokenizer, EmptySpaceTokenizer, NullTokenizer} (default: DiffTokenizer)").create("t"));
 		opts.addOption(OptionBuilder.withLongOpt("parallel").withArgName("num").hasArg().withDescription("Specify the number of parallel threads. (Note: output might be genereated in a different order than provided by input, specify 1 if you need to keep the order. Parallel mode requires one document per line [ -l ] (default: 1).").create());
 		opts.addOption(OptionBuilder.withLongOpt("normalize").withDescription("Specify the degree of token normalization [0...4] (default: 0).").hasArg().withArgName("level").create("nl"));
-		opts.addOption(OptionBuilder.withLongOpt("filter").withDescription("Specify the degree of token filtering [0...6] (default: 2).").hasArg().withArgName("level").create("fl"));
+		opts.addOption(OptionBuilder.withLongOpt("filter").withDescription("Specify the degree of token filtering [0...5] (default: 2).").hasArg().withArgName("level").create("fl"));
 		opts.addOption(OptionBuilder.withLongOpt("merge").withDescription("Specify the degree of merging conscutive items {0,1,2} (default: 0).").hasOptionalArg().withArgName("level").create("ml"));
 		opts.addOption(OptionBuilder.withLongOpt("onedocperline").withDescription("Specify if you want to process documents linewise and preserve document ids, i.e. map line numbers to sentences.").create("l"));
 		opts.addOption(OptionBuilder.withLongOpt("sentence-ruleset").withArgName("languagecode").hasArg().withDescription(String.format("Specify the ruleset that you want to use together with RuleSplitter (avaliable: %s) (default: 'default')", de.tudarmstadt.lt.seg.sentence.rules.RuleSet.getAvailable())).create());