Skip to content

Commit

Permalink
GPT3 tokenizer (#1869)
Browse files Browse the repository at this point in the history
Add GPT3 tokenizer.

Based on .NET implementation.

---------

Co-authored-by: Luigi Montoya <[email protected]>
Co-authored-by: joe-braley <[email protected]>
Co-authored-by: Luigi96 <[email protected]>
Co-authored-by: Mark Wallace <[email protected]>
Co-authored-by: John Oliver <[email protected]>
Co-authored-by: David Grieve <[email protected]>
  • Loading branch information
7 people authored Jul 18, 2023
1 parent 51c7569 commit e20b6f4
Show file tree
Hide file tree
Showing 12 changed files with 50,603 additions and 1 deletion.
1 change: 1 addition & 0 deletions java/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
<module>semantickernel-extensions-parent</module>
<module>semantickernel-connectors-parent</module>
<module>semantickernel-core-skills</module>
<module>semantickernel-gpt3-tokenizer</module>
</modules>

<dependencyManagement>
Expand Down
5 changes: 5 additions & 0 deletions java/samples/sample-code/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@
<artifactId>semantickernel-sequentialplanner-extension</artifactId>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-gpt3-tokenizer</artifactId>
<scope>compile</scope>
</dependency>
</dependencies>
<build>
<plugins>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package com.microsoft.semantickernel.syntaxexamples;

import com.microsoft.semantickernel.GPT3Tokenizer;
public class Example29_Tokenizer {
/**
* This sample shows how to count tokens using GPT tokenizer. The number of tokens affects
* API calls cost and each model has a maximum amount of tokens it can process and generate.
* This example is specific to OpenAI models, which use the tokenization described here:
* <a href="https://platform.openai.com/tokenizer">OpenAI tokenizer</a>
* If you use Semantic Kernel with other models, the tokenization logic is most probably different,
* and you should not use the GPT tokenizer.
*/
public static void main(String[] args) {
// Example 1
String sentence = "Some text on one line";
int tokenCount = GPT3Tokenizer.encode(sentence).size();

System.out.println("---");
System.out.println(sentence);
System.out.println("Tokens: " + tokenCount);
System.out.println("---\n\n");

// Example 2
sentence = "⭐⭐";
tokenCount = GPT3Tokenizer.encode(sentence).size();

System.out.println("The following example contains emojis which require several tokens.");
System.out.println("---");
System.out.println(sentence);
System.out.println("Tokens: " + tokenCount);
System.out.println("---\n\n");

// Example 3
sentence = "Some text on\ntwo lines";
tokenCount = GPT3Tokenizer.encode(sentence).size();

System.out.println("The following example uses Unix '\\n' line separator.");
System.out.println("---");
System.out.println(sentence);
System.out.println("Tokens: " + tokenCount);
System.out.println("---\n\n");

// Example 4
sentence = "Some text on\r\ntwo lines";
tokenCount = GPT3Tokenizer.encode(sentence).size();

System.out.println("The following example uses Windows '\\r\\n' line separator.");
System.out.println("---");
System.out.println(sentence);
System.out.println("Tokens: " + tokenCount);
System.out.println("---\n\n");

/*
Output:
---
Some text on one line
Tokens: 5
---
The following example contains emojis which require several tokens.
---
⭐⭐
Tokens: 6
---
The following example uses Unix '\n' line separator.
---
Some text on
two lines
Tokens: 6
---
The following example uses Windows '\r\n' line separator.
---
Some text on
two lines
Tokens: 7
---
*/
}
}
1 change: 0 additions & 1 deletion java/semantickernel-api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,4 @@
<artifactId>jsr305</artifactId>
</dependency>
</dependencies>

</project>
5 changes: 5 additions & 0 deletions java/semantickernel-bom/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@
<artifactId>semantickernel-actionplanner-extension</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-gpt3-tokenizer</artifactId>
<version>${project.version}</version>
</dependency>


<dependency>
Expand Down
34 changes: 34 additions & 0 deletions java/semantickernel-gpt3-tokenizer/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Maven build descriptor for the semantickernel-gpt3-tokenizer module.
It inherits from semantickernel-parent and declares the libraries the module depends on.
No dependency below carries a <version>; presumably versions are supplied by
dependencyManagement in the parent POM (not visible here) - confirm.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Parent project; the module takes its groupId and version from it since none is set below. -->
<parent>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-parent</artifactId>
<version>0.2.7-alpha-SNAPSHOT</version>
</parent>

<!-- Artifact id under which the sample-code and BOM POMs reference this module. -->
<artifactId>semantickernel-gpt3-tokenizer</artifactId>

<dependencies>
<!-- NOTE(review): whether the tokenizer needs Reactor cannot be told from this POM alone - confirm. -->
<dependency>
<groupId>io.projectreactor</groupId>
<artifactId>reactor-core</artifactId>
</dependency>
<!-- The only dependency here with an explicit scope (compile). -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.microsoft.semantic-kernel</groupId>
<artifactId>semantickernel-api</artifactId>
</dependency>
<!--
NOTE(review): no <scope>test</scope> is declared here. Unless the parent's
dependencyManagement assigns test scope, JUnit would be a compile-scope dependency
of this module and be passed on to its consumers - verify against the parent POM.
-->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
</dependency>
</dependencies>

</project>
Loading

0 comments on commit e20b6f4

Please sign in to comment.