Merge pull request #268 from AntonOsika/ao/new-benchmark

v0.0.4 benchmark, stricter typing, smaller fixes and new README
AntonOsika · Jun 20, 2023 · 95093cd · 95093cd
2 parents 4c68126 + b85ccf5
commit 95093cd
Show file tree

Hide file tree

Showing 8 changed files with 112 additions and 40 deletions.
diff --git a/.gitignore b/.gitignore
@@ -44,6 +44,7 @@ archive
 # any log file
 *log.txt
 todo
+scratchpad
 
 # Ignore GPT Engineer files
 projects

diff --git a/README.md b/README.md
@@ -49,11 +49,6 @@ With an api key that has GPT4 access run:
 **Results**
 - Check the generated files in `projects/my-new-project/workspace`
 
-### Limitations
-Implementing additional chain of thought prompting, e.g. [Reflexion](https://github.com/noahshinn024/reflexion), should be able to make it more reliable and not miss requested functionality in the main prompt.
-
-Contributors welcome! If you are unsure what to add, check out the ideas listed in the Projects tab in the GitHub repo.
-
 
 ## Features
 You can specify the "identity" of the AI agent by editing the files in the `identity` folder.
@@ -63,7 +58,11 @@ Editing the identity, and evolving the `main_prompt`, is currently how you make
 Each step in `steps.py` will have its communication history with GPT4 stored in the logs folder, and can be rerun with `scripts/rerun_edited_message_logs.py`.
 
 ## Contributing
-If you want to contribute, please check out the [projects](https://github.com/AntonOsika/gpt-engineer/projects?query=is%3Aopen) or [issues tab](https://github.com/AntonOsika/gpt-engineer/issues) in the GitHub repo and please read the [contributing document](.github/CONTRIBUTING.md) on how to contribute. Here is our [Discord 💬](https://discord.gg/4t5vXHhu)
+We are building the open platform for devs to tinker with and build their personal code-generation toolbox.
+
+If you want to contribute, please check out the [roadmap](https://github.com/AntonOsika/gpt-engineer/blob/main/ROADMAP.md), [projects](https://github.com/AntonOsika/gpt-engineer/projects?query=is%3Aopen) or [issues tab](https://github.com/AntonOsika/gpt-engineer/issues) in the GitHub repo. You are welcome to read the [contributing document](.github/CONTRIBUTING.md) and join our [Discord 💬](https://discord.gg/4t5vXHhu).
+
+We are currently looking for more maintainers and community organisers. Email [email protected] if you are interested in an official role.
 
 
 ## Example

diff --git a/benchmark/RESULTS.md b/benchmark/RESULTS.md
@@ -4,6 +4,36 @@
 $ python scripts/benchmark.py
 ```
 
+# 2023-06-21
+
+| Benchmark          | Ran | Works | Perfect |
+|--------------------|-----|-------|---------|
+| currency_converter | ✅  | ❌    | ❌      |
+| image_resizer      | ✅  | ✅    | ✅      |
+| pomodoro_timer     | ✅  | ✅    | ✅      |
+| url_shortener      | ✅  | ✅    | ✅      |
+| file_explorer      | ✅  | ✅    | ✅      |
+| markdown_editor    | ✅  | ✅    | ❌      |
+| timer_app          | ✅  | ❌    | ❌      |
+| weather_app        | ✅  | ✅    | ✅      |
+| file_organizer     | ✅  | ✅    | ✅      |
+| password_generator | ✅  | ✅    | ✅      |
+| todo_list          | ✅  | ✅    | ✅      |
+
+
+# Notes on the errors
+Most errors come from the "generate entrypoint" step producing an incorrect entrypoint. Ignoring
+those, we get 8/11 fully correct.
+
+All errors are very easy to fix.
+
+One error was an attempt to modify a constant.
+One error was that the HTML template was not fully filled in.
+One error was that a dependency was used incorrectly, which is easy to fix.
+
+
+# 2023-06-19
+
 | Benchmark          | Ran | Works | Perfect |
 |--------------------|-----|-------|---------|
 | currency_converter | ❌  | ❌    | ❌      |

diff --git a/gpt_engineer/main.py b/gpt_engineer/main.py
@@ -7,6 +7,7 @@
 
 import typer
 
+from gpt_engineer import steps
 from gpt_engineer.ai import AI
 from gpt_engineer.db import DB, DBs
 from gpt_engineer.steps import STEPS
@@ -20,7 +21,9 @@ def main(
     delete_existing: bool = typer.Argument(False, help="delete existing files"),
     model: str = "gpt-4",
     temperature: float = 0.1,
-    steps_config: str = "default",
+    steps_config: steps.Config = typer.Option(
+        steps.Config.DEFAULT, "--steps", "-s", help="decide which steps to run"
+    ),
     verbose: bool = typer.Option(False, "--verbose", "-v"),
     run_prefix: str = typer.Option(
         "",

diff --git a/gpt_engineer/steps.py b/gpt_engineer/steps.py
@@ -2,6 +2,8 @@
 import re
 import subprocess
 
+from enum import Enum
+
 from gpt_engineer.ai import AI
 from gpt_engineer.chat_to_files import to_files
 from gpt_engineer.db import DBs
@@ -34,10 +36,10 @@ def clarify(ai: AI, dbs: DBs):
             break
 
         print()
-        user = input('(answer in text, or "q" to move on)\n')
+        user = input('(answer in text, or "c" to move on)\n')
         print()
 
-        if not user or user == "q":
+        if not user or user == "c":
             break
 
         user += (
@@ -145,10 +147,16 @@ def execute_entrypoint(ai, dbs):
     print()
     print('If yes, press enter. Otherwise, type "no"')
     print()
-    if input() != "":
+    if input() not in ["", "y", "yes"]:
         print("Ok, not executing the code.")
         return []
     print("Executing the code...")
+    print(
+        "\033[92m"  # green color
+        + "Note: If it does not work as expected, please consider running the code'"
+        + " in another way than above."
+        + "\033[0m"
+    )
     print()
     subprocess.run("bash run.sh", shell=True, cwd=dbs.workspace.path)
     return []
@@ -165,6 +173,8 @@ def gen_entrypoint(ai, dbs):
             "b) run all necessary parts of the codebase (in parallell if necessary).\n"
             "Do not install globally. Do not use sudo.\n"
             "Do not explain the code, just give the commands.\n"
+            "Do not use placeholders, use example values (like . for a folder argument) "
+            "if necessary.\n"
         ),
         user="Information about the codebase:\n\n" + dbs.workspace["all_output.txt"],
     )
@@ -183,7 +193,7 @@ def use_feedback(ai: AI, dbs: DBs):
         ai.fassistant(dbs.workspace["all_output.txt"]),
         ai.fsystem(dbs.identity["use_feedback"]),
     ]
-    messages = ai.next(messages, dbs.memory["feedback"])
+    messages = ai.next(messages, dbs.input["feedback"])
     to_files(messages[-1]["content"], dbs.workspace)
     return messages
 
@@ -201,35 +211,61 @@ def fix_code(ai: AI, dbs: DBs):
     return messages
 
 
+class Config(str, Enum):
+    DEFAULT = "default"
+    BENCHMARK = "benchmark"
+    SIMPLE = "simple"
+    TDD = "tdd"
+    TDD_PLUS = "tdd+"
+    CLARIFY = "clarify"
+    RESPEC = "respec"
+    EXECUTE_ONLY = "execute_only"
+    USE_FEEDBACK = "use_feedback"
+
+
 # Different configs of what steps to run
 STEPS = {
-    "default": [simple_gen, gen_entrypoint, execute_entrypoint],
-    "benchmark": [simple_gen, gen_entrypoint],
-    "simple": [simple_gen, gen_entrypoint, execute_entrypoint],
-    "tdd": [gen_spec, gen_unit_tests, gen_code, gen_entrypoint, execute_entrypoint],
-    "tdd+": [
+    Config.DEFAULT: [
+        clarify,
+        gen_clarified_code,
+        gen_entrypoint,
+        execute_entrypoint,
+    ],
+    Config.BENCHMARK: [simple_gen, gen_entrypoint],
+    Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
+    Config.TDD: [
+        gen_spec,
+        gen_unit_tests,
+        gen_code,
+        gen_entrypoint,
+        execute_entrypoint,
+    ],
+    Config.TDD_PLUS: [
         gen_spec,
         gen_unit_tests,
         gen_code,
         fix_code,
         gen_entrypoint,
         execute_entrypoint,
     ],
-    "clarify": [clarify, gen_clarified_code, gen_entrypoint, execute_entrypoint],
-    "respec": [
+    Config.CLARIFY: [
+        clarify,
+        gen_clarified_code,
+        gen_entrypoint,
+        execute_entrypoint,
+    ],
+    Config.RESPEC: [
         gen_spec,
         respec,
         gen_unit_tests,
         gen_code,
         gen_entrypoint,
         execute_entrypoint,
     ],
-    "execute_only": [execute_entrypoint],
-    "use_feedback": [use_feedback],
+    Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint],
+    Config.EXECUTE_ONLY: [gen_entrypoint, execute_entrypoint],
 }
 
 # Future steps that can be added:
-# self_reflect_and_improve_files,
-# add_tests
-# run_tests_and_fix_files,
-# improve_based_on_in_file_feedback_comments
+# run_tests_and_fix_files
+# execute_entrypoint_and_fix_files_if_needed
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
 
 [project]
 name = "gpt-engineer"
-version = "0.0.3"
+version = "0.0.4"
 description = "Specify what you want it to build, the AI asks for clarification, and then builds it."
 readme = "README.md"
 requires-python = ">=3"

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
@@ -35,7 +35,7 @@ def main(
                     "-m",
                     "gpt_engineer.main",
                     bench_folder,
-                    "--steps-config",
+                    "--steps",
                     "benchmark",
                 ],
                 stdout=log_file,
@@ -66,7 +66,7 @@ def main(
                     "-m",
                     "gpt_engineer.main",
                     bench_folder,
-                    "--steps-config",
+                    "--steps",
                     "execute_only",
                 ],
             )

diff --git a/scripts/print_chat.py b/scripts/print_chat.py
@@ -16,19 +16,22 @@ def pretty_print_conversation(messages):
     }
     formatted_messages = []
     for message in messages:
-        assistant_content = (
-            message["function_call"]
-            if message.get("function_call")
-            else message["content"]
-        )
-        role_to_message = {
-            "system": f"system: {message['content']}\n",
-            "user": f"user: {message['content']}\n",
-            "assistant": f"assistant: {assistant_content}\n",
-            "function": f"function ({message['name']}): {message['content']}\n",
-        }
-
-        formatted_messages.append(role_to_message[message["role"]])
+        if message["role"] == "function":
+            formatted_messages.append(
+                f"function ({message['name']}): {message['content']}\n"
+            )
+        else:
+            assistant_content = (
+                message["function_call"]
+                if message.get("function_call")
+                else message["content"]
+            )
+            role_to_message = {
+                "system": f"system: {message['content']}\n",
+                "user": f"user: {message['content']}\n",
+                "assistant": f"assistant: {assistant_content}\n",
+            }
+            formatted_messages.append(role_to_message[message["role"]])
 
     for formatted_message in formatted_messages:
         print(