Updates the token count doc with GPT-4o and removes the deprecated GPT-4-0314 model. #1212

Open · wants to merge 3 commits into base: main
5 changes: 5 additions & 0 deletions authors.yaml
Original file line number Diff line number Diff line change
@@ -97,3 +97,8 @@ justonf:
name: "Juston Forte"
website: "https://www.linkedin.com/in/justonforte/"
avatar: "https://avatars.githubusercontent.com/u/96567547?s=400&u=08b9757200906ab12e3989b561cff6c4b95a12cb&v=4"

davi-reis-vieira:
name: "Davi Reis"
website: "https://www.linkedin.com/in/davireisvieira/"
avatar: "https://avatars.githubusercontent.com/u/36394034?v=4"
74 changes: 37 additions & 37 deletions examples/How_to_count_tokens_with_tiktoken.ipynb
@@ -22,7 +22,7 @@
"\n",
"| Encoding name | OpenAI models |\n",
"|-------------------------|-----------------------------------------------------|\n",
"| `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large` |\n",
"| `cl100k_base` | `gpt-4o`, `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large` |\n",
"| `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003`|\n",
"| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` |\n",
"\n",
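The encoding table in this hunk can be mirrored as a small lookup helper. The sketch below is hypothetical and only covers the models named in the table; in real code, `tiktoken.encoding_for_model()` is the authoritative, up-to-date source for this mapping.

```python
# Hypothetical helper mirroring the encoding table above.
# For real use, prefer tiktoken.encoding_for_model(model_name).
MODEL_TO_ENCODING = {
    "gpt-4o": "cl100k_base",
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo": "cl100k_base",
    "text-embedding-ada-002": "cl100k_base",
    "text-embedding-3-small": "cl100k_base",
    "text-embedding-3-large": "cl100k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-003": "p50k_base",
    "davinci": "r50k_base",
}

def encoding_name_for_model(model: str) -> str:
    """Return the encoding name for a model listed in the table above."""
    try:
        return MODEL_TO_ENCODING[model]
    except KeyError:
        raise KeyError(f"Model not in table: {model}") from None
```

Note that the PR places `gpt-4o` under `cl100k_base`; should the tokenizer for a model ever change, a static table like this goes stale, which is why deferring to tiktoken itself is preferable.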
@@ -110,11 +110,11 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"encoding = tiktoken.get_encoding(\"cl100k_base\")\n"
"encoding = tiktoken.get_encoding(\"cl100k_base\")"
]
},
{
@@ -127,7 +127,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -153,7 +153,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -162,13 +162,13 @@
"[83, 1609, 5963, 374, 2294, 0]"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoding.encode(\"tiktoken is great!\")\n"
"encoding.encode(\"tiktoken is great!\")"
]
},
{
@@ -181,20 +181,20 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def num_tokens_from_string(string: str, encoding_name: str) -> int:\n",
" \"\"\"Returns the number of tokens in a text string.\"\"\"\n",
" encoding = tiktoken.get_encoding(encoding_name)\n",
" num_tokens = len(encoding.encode(string))\n",
" return num_tokens\n"
" return num_tokens"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -203,13 +203,13 @@
"6"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num_tokens_from_string(\"tiktoken is great!\", \"cl100k_base\")\n"
"num_tokens_from_string(\"tiktoken is great!\", \"cl100k_base\")"
]
},
{
@@ -230,7 +230,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -239,13 +239,13 @@
"'tiktoken is great!'"
]
},
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoding.decode([83, 1609, 5963, 374, 2294, 0])\n"
"encoding.decode([83, 1609, 5963, 374, 2294, 0])"
]
},
{
@@ -266,7 +266,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -275,13 +275,13 @@
"[b't', b'ik', b'token', b' is', b' great', b'!']"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[encoding.decode_single_token_bytes(token) for token in [83, 1609, 5963, 374, 2294, 0]]\n"
"[encoding.decode_single_token_bytes(token) for token in [83, 1609, 5963, 374, 2294, 0]]"
]
},
{
@@ -304,7 +304,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -321,13 +321,12 @@
" print()\n",
" print(f\"{encoding_name}: {num_tokens} tokens\")\n",
" print(f\"token integers: {token_integers}\")\n",
" print(f\"token bytes: {token_bytes}\")\n",
" "
" print(f\"token bytes: {token_bytes}\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -352,12 +351,12 @@
}
],
"source": [
"compare_encodings(\"antidisestablishmentarianism\")\n"
"compare_encodings(\"antidisestablishmentarianism\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -382,12 +381,12 @@
}
],
"source": [
"compare_encodings(\"2 + 2 = 4\")\n"
"compare_encodings(\"2 + 2 = 4\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -412,7 +411,7 @@
}
],
"source": [
"compare_encodings(\"お誕生日おめでとう\")\n"
"compare_encodings(\"お誕生日おめでとう\")"
]
},
{
@@ -422,9 +421,9 @@
"source": [
"## 6. Counting tokens for chat completions API calls\n",
"\n",
"ChatGPT models like `gpt-3.5-turbo` and `gpt-4` use tokens in the same way as older completions models, but because of their message-based formatting, it's more difficult to count how many tokens will be used by a conversation.\n",
"ChatGPT models like `gpt-3.5-turbo`, `gpt-4` and `gpt-4o` use tokens in the same way as older completions models, but because of their message-based formatting, it's more difficult to count how many tokens will be used by a conversation.\n",
"\n",
"Below is an example function for counting tokens for messages passed to `gpt-3.5-turbo` or `gpt-4`.\n",
"Below is an example function for counting tokens for messages passed to `gpt-3.5-turbo`, `gpt-4` or `gpt-4o`.\n",
"\n",
"Note that the exact way that tokens are counted from messages may change from model to model. Consider the counts from the function below an estimate, not a timeless guarantee.\n",
"\n",
@@ -433,7 +432,7 @@
},
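The bookkeeping in the `num_tokens_from_messages` function this hunk modifies reduces to simple arithmetic for the models it covers: 3 tokens of overhead per message, 1 extra token when a message carries a `name` field, and 3 tokens priming the assistant's reply. A pure-Python sketch of just that fixed overhead (counting the message content itself still requires tiktoken):

```python
def chat_overhead_tokens(num_messages: int, num_named_messages: int = 0) -> int:
    """Fixed token overhead of a chat request under gpt-3.5-turbo / gpt-4 /
    gpt-4o style message formatting, per the function below: 3 tokens per
    message, 1 extra per 'name' field, plus 3 priming the assistant reply."""
    tokens_per_message = 3
    tokens_per_name = 1
    return (num_messages * tokens_per_message
            + num_named_messages * tokens_per_name
            + 3)  # every reply is primed with <|start|>assistant<|message|>
```

For example, a two-message conversation with no `name` fields carries 2 × 3 + 3 = 9 tokens of overhead before any content is counted. As the notebook cautions, these constants are model-specific estimates, not a timeless guarantee.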
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -451,6 +450,7 @@
" \"gpt-4-32k-0314\",\n",
" \"gpt-4-0613\",\n",
" \"gpt-4-32k-0613\",\n",
" \"gpt-4o\",\n",
" }:\n",
" tokens_per_message = 3\n",
" tokens_per_name = 1\n",
@@ -475,12 +475,12 @@
" if key == \"name\":\n",
" num_tokens += tokens_per_name\n",
" num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>\n",
" return num_tokens\n"
" return num_tokens"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -500,10 +500,6 @@
"129 prompt tokens counted by num_tokens_from_messages().\n",
"129 prompt tokens counted by the OpenAI API.\n",
"\n",
"gpt-4-0314\n",
"129 prompt tokens counted by num_tokens_from_messages().\n",
"129 prompt tokens counted by the OpenAI API.\n",
"\n",
"gpt-4-0613\n",
"129 prompt tokens counted by num_tokens_from_messages().\n",
"129 prompt tokens counted by the OpenAI API.\n",
@@ -512,6 +508,10 @@
"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\n",
"129 prompt tokens counted by num_tokens_from_messages().\n",
"129 prompt tokens counted by the OpenAI API.\n",
"\n",
"gpt-4o\n",
"124 prompt tokens counted by num_tokens_from_messages().\n",
"124 prompt tokens counted by the OpenAI API.\n",
"\n"
]
}
@@ -559,9 +559,9 @@
" \"gpt-3.5-turbo-0301\",\n",
" \"gpt-3.5-turbo-0613\",\n",
" \"gpt-3.5-turbo\",\n",
" \"gpt-4-0314\",\n",
" \"gpt-4-0613\",\n",
" \"gpt-4\",\n",
" \"gpt-4o\",\n",
" ]:\n",
" print(model)\n",
" # example token count from the function defined above\n",
3 changes: 2 additions & 1 deletion registry.yaml
@@ -167,9 +167,10 @@

- title: How to count tokens with Tiktoken
path: examples/How_to_count_tokens_with_tiktoken.ipynb
date: 2022-12-16
date: 2024-05-14
authors:
- ted-at-openai
- davi-reis-vieira
tags:
- tiktoken
- completions