From 28a300c5784415af66f81b2acc0db182f6eb3bbd Mon Sep 17 00:00:00 2001
From: Enoch Cheung <enoch@enochc.com>
Date: Mon, 4 Mar 2024 09:31:49 -0800
Subject: [PATCH] wip (#206)

---
 openapi.yaml | 397 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 323 insertions(+), 74 deletions(-)

diff --git a/openapi.yaml b/openapi.yaml
index aa2f96a4..a58331cf 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -32,7 +32,7 @@ tags:
   - name: Models
     description: List and describe the various models available in the API.
   - name: Moderations
-    description: Given a input text, outputs if the model classifies it as violating OpenAI's content policy.
+    description: Given some input text, outputs if the model classifies it as potentially harmful.
 paths:
   # Note: When adding an endpoint, make sure you also add it in the `groups` section, in the end of this file,
   # under the appropriate group
@@ -115,7 +115,7 @@ paths:
                 "id": "chatcmpl-123",
                 "object": "chat.completion",
                 "created": 1677652288,
-                "model": "gpt-3.5-turbo-0613",
+                "model": "gpt-3.5-turbo-0125",
                 "system_fingerprint": "fp_44709d6fcb",
                 "choices": [{
                   "index": 0,
@@ -212,7 +212,7 @@ paths:
                 "id": "chatcmpl-123",
                 "object": "chat.completion",
                 "created": 1677652288,
-                "model": "gpt-3.5-turbo-0613",
+                "model": "gpt-3.5-turbo-0125",
                 "system_fingerprint": "fp_44709d6fcb",
                 "choices": [{
                   "index": 0,
@@ -287,19 +287,13 @@ paths:
 
                 main();
             response: &chat_completion_chunk_example |
-              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
+              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0125", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
 
-              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{"content":"Hello"},"logprobs":null,"finish_reason":null}]}
-
-              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{"content":"!"},"logprobs":null,"finish_reason":null}]}
+              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0125", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{"content":"Hello"},"logprobs":null,"finish_reason":null}]}
 
               ....
 
-              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{"content":" today"},"logprobs":null,"finish_reason":null}]}
-
-              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{"content":"?"},"logprobs":null,"finish_reason":null}]}
-
-              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0613", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]}
+              {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-3.5-turbo-0125", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]}
           - title: Functions
             request:
               curl: |
@@ -416,7 +410,7 @@ paths:
                 "id": "chatcmpl-abc123",
                 "object": "chat.completion",
                 "created": 1699896916,
-                "model": "gpt-3.5-turbo-0613",
+                "model": "gpt-3.5-turbo-0125",
                 "choices": [
                   {
                     "index": 0,
@@ -498,7 +492,7 @@ paths:
                 "id": "chatcmpl-123",
                 "object": "chat.completion",
                 "created": 1702685778,
-                "model": "gpt-3.5-turbo-0613",
+                "model": "gpt-3.5-turbo-0125",
                 "choices": [
                   {
                     "index": 0,
@@ -1201,47 +1195,174 @@ paths:
           content:
             application/json:
               schema:
-                $ref: "#/components/schemas/CreateTranscriptionResponse"
+                oneOf:  # plain JSON by default; verbose JSON when response_format is verbose_json
+                  - $ref: "#/components/schemas/CreateTranscriptionResponseJson"
+                  - $ref: "#/components/schemas/CreateTranscriptionResponseVerboseJson"
       x-oaiMeta:
         name: Create transcription
         group: audio
-        returns: The transcribed text.
+        returns: The [transcription object](/docs/api-reference/audio/json-object) or a [verbose transcription object](/docs/api-reference/audio/verbose-json-object).
         examples:
-          request:
-            curl: |
-              curl https://api.openai.com/v1/audio/transcriptions \
-                -H "Authorization: Bearer $OPENAI_API_KEY" \
-                -H "Content-Type: multipart/form-data" \
-                -F file="@/path/to/file/audio.mp3" \
-                -F model="whisper-1"
-            python: |
-              from openai import OpenAI
-              client = OpenAI()
+          - title: Default
+            request:
+              curl: |
+                curl https://api.openai.com/v1/audio/transcriptions \
+                  -H "Authorization: Bearer $OPENAI_API_KEY" \
+                  -H "Content-Type: multipart/form-data" \
+                  -F file="@/path/to/file/audio.mp3" \
+                  -F model="whisper-1"
+              python: |
+                from openai import OpenAI
+                client = OpenAI()
 
-              audio_file = open("speech.mp3", "rb")
-              transcript = client.audio.transcriptions.create(
-                model="whisper-1",
-                file=audio_file
-              )
-            node: |
-              import fs from "fs";
-              import OpenAI from "openai";
+                audio_file = open("speech.mp3", "rb")
+                transcript = client.audio.transcriptions.create(
+                  model="whisper-1",
+                  file=audio_file
+                )
+              node: |
+                import fs from "fs";
+                import OpenAI from "openai";
 
-              const openai = new OpenAI();
+                const openai = new OpenAI();
 
-              async function main() {
-                const transcription = await openai.audio.transcriptions.create({
-                  file: fs.createReadStream("audio.mp3"),
-                  model: "whisper-1",
-                });
+                async function main() {
+                  const transcription = await openai.audio.transcriptions.create({
+                    file: fs.createReadStream("audio.mp3"),
+                    model: "whisper-1",
+                  });
 
-                console.log(transcription.text);
+                  console.log(transcription.text);
+                }
+                main();
+            response: &basic_transcription_response_example |
+              {
+                "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger. This is a place where you can get to do that."
+              }
+          - title: Word timestamps
+            request:
+              curl: |
+                curl https://api.openai.com/v1/audio/transcriptions \
+                  -H "Authorization: Bearer $OPENAI_API_KEY" \
+                  -H "Content-Type: multipart/form-data" \
+                  -F file="@/path/to/file/audio.mp3" \
+                  -F "timestamp_granularities[]=word" \
+                  -F model="whisper-1" \
+                  -F response_format="verbose_json"
+              python: |
+                from openai import OpenAI
+                client = OpenAI()
+
+                audio_file = open("speech.mp3", "rb")
+                transcript = client.audio.transcriptions.create(
+                  file=audio_file,
+                  model="whisper-1",
+                  response_format="verbose_json",
+                  timestamp_granularities=["word"]
+                )
+
+                print(transcript.words)
+              node: |
+                import fs from "fs";
+                import OpenAI from "openai";
+
+                const openai = new OpenAI();
+
+                async function main() {
+                  const transcription = await openai.audio.transcriptions.create({
+                    file: fs.createReadStream("audio.mp3"),
+                    model: "whisper-1",
+                    response_format: "verbose_json",
+                    timestamp_granularities: ["word"]
+                  });
+
+                  console.log(transcription.text);
+                }
+                main();
+            response: |
+              {
+                "task": "transcribe",
+                "language": "english",
+                "duration": 8.470000267028809,
+                "text": "The beach was a popular spot on a hot summer day. People were swimming in the ocean, building sandcastles, and playing beach volleyball.",
+                "words": [
+                  {
+                    "word": "The",
+                    "start": 0.0,
+                    "end": 0.23999999463558197
+                  },
+                  ...
+                  {
+                    "word": "volleyball",
+                    "start": 7.400000095367432,
+                    "end": 7.900000095367432
+                  }
+                ]
+              }
+          - title: Segment timestamps
+            request:
+              curl: |
+                curl https://api.openai.com/v1/audio/transcriptions \
+                  -H "Authorization: Bearer $OPENAI_API_KEY" \
+                  -H "Content-Type: multipart/form-data" \
+                  -F file="@/path/to/file/audio.mp3" \
+                  -F "timestamp_granularities[]=segment" \
+                  -F model="whisper-1" \
+                  -F response_format="verbose_json"
+              python: |
+                from openai import OpenAI
+                client = OpenAI()
+
+                audio_file = open("speech.mp3", "rb")
+                transcript = client.audio.transcriptions.create(
+                  file=audio_file,
+                  model="whisper-1",
+                  response_format="verbose_json",
+                  timestamp_granularities=["segment"]
+                )
+
+                print(transcript.segments)
+              node: |
+                import fs from "fs";
+                import OpenAI from "openai";
+
+                const openai = new OpenAI();
+
+                async function main() {
+                  const transcription = await openai.audio.transcriptions.create({
+                    file: fs.createReadStream("audio.mp3"),
+                    model: "whisper-1",
+                    response_format: "verbose_json",
+                    timestamp_granularities: ["segment"]
+                  });
+
+                  console.log(transcription.text);
+                }
+                main();
+            response: &verbose_transcription_response_example |
+              {
+                "task": "transcribe",
+                "language": "english",
+                "duration": 8.470000267028809,
+                "text": "The beach was a popular spot on a hot summer day. People were swimming in the ocean, building sandcastles, and playing beach volleyball.",
+                "segments": [
+                  {
+                    "id": 0,
+                    "seek": 0,
+                    "start": 0.0,
+                    "end": 3.319999933242798,
+                    "text": " The beach was a popular spot on a hot summer day.",
+                    "tokens": [
+                      50364, 440, 7534, 390, 257, 3743, 4008, 322, 257, 2368, 4266, 786, 13, 50530
+                    ],
+                    "temperature": 0.0,
+                    "avg_logprob": -0.2860786020755768,
+                    "compression_ratio": 1.2363636493682861,
+                    "no_speech_prob": 0.00985979475080967
+                  },
+                  ...
+                ]
               }
-              main();
-          response: |
-            {
-              "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger. This is a place where you can get to do that."
-            }
   /audio/translations:
     post:
       operationId: createTranslation
@@ -1260,7 +1381,9 @@ paths:
           content:
             application/json:
               schema:
-                $ref: "#/components/schemas/CreateTranslationResponse"
+                oneOf:
+                  - $ref: "#/components/schemas/CreateTranslationResponseJson"
+                  - $ref: "#/components/schemas/CreateTranslationResponseVerboseJson"
       x-oaiMeta:
         name: Create translation
         group: audio
@@ -1658,7 +1781,7 @@ paths:
               {
                 "object": "fine_tuning.job",
                 "id": "ftjob-abc123",
-                "model": "gpt-3.5-turbo-0613",
+                "model": "gpt-3.5-turbo-0125",
                 "created_at": 1614807352,
                 "fine_tuned_model": null,
                 "organization_id": "org-123",
@@ -1711,7 +1834,7 @@ paths:
               {
                 "object": "fine_tuning.job",
                 "id": "ftjob-abc123",
-                "model": "gpt-3.5-turbo-0613",
+                "model": "gpt-3.5-turbo-0125",
                 "created_at": 1614807352,
                 "fine_tuned_model": null,
                 "organization_id": "org-123",
@@ -1760,7 +1883,7 @@ paths:
               {
                 "object": "fine_tuning.job",
                 "id": "ftjob-abc123",
-                "model": "gpt-3.5-turbo-0613",
+                "model": "gpt-3.5-turbo-0125",
                 "created_at": 1614807352,
                 "fine_tuned_model": null,
                 "organization_id": "org-123",
@@ -2056,7 +2179,7 @@ paths:
             {
               "object": "fine_tuning.job",
               "id": "ftjob-abc123",
-              "model": "gpt-3.5-turbo-0613",
+              "model": "gpt-3.5-turbo-0125",
               "created_at": 1689376978,
               "fine_tuned_model": null,
               "organization_id": "org-123",
@@ -2247,7 +2370,7 @@ paths:
       operationId: createModeration
       tags:
         - Moderations
-      summary: Classifies if text violates OpenAI's Content Policy
+      summary: Classifies if text is potentially harmful.
       requestBody:
         required: true
         content:
@@ -2278,7 +2401,8 @@ paths:
               from openai import OpenAI
               client = OpenAI()
 
-              client.moderations.create(input="I want to kill them.")
+              moderation = client.moderations.create(input="I want to kill them.")
+              print(moderation)
             node.js: |
               import OpenAI from "openai";
 
@@ -5834,10 +5958,10 @@ components:
           default: false
           nullable: true
         top_logprobs:
-          description: An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to `true` if this parameter is used.
+          description: An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to `true` if this parameter is used.
           type: integer
           minimum: 0
-          maximum: 5
+          maximum: 20
           nullable: true
         max_tokens:
           description: |
@@ -6112,7 +6236,7 @@ components:
           description: The token.
           type: string
         logprob: &chat_completion_response_logprobs_token_logprob
-          description: The log probability of this token.
+          description: The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value `-9999.0` is used to signify that the token is very unlikely.
           type: number
         bytes: &chat_completion_response_logprobs_bytes
           description: A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be `null` if there is no bytes representation for the token.
@@ -6262,7 +6386,7 @@ components:
           default: "url"
           example: "url"
           nullable: true
-          description: The format in which the generated images are returned. Must be one of `url` or `b64_json`.
+          description: The format in which the generated images are returned. Must be one of `url` or `b64_json`. URLs are only valid for 60 minutes after the image has been generated.
         size: &images_size
           type: string
           enum: ["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"]
@@ -6416,7 +6540,7 @@ components:
 
     CreateModerationResponse:
       type: object
-      description: Represents policy compliance report by OpenAI's content moderation model against a given input.
+      description: Represents if a given text input is potentially harmful.
       properties:
         id:
           type: string
@@ -6432,7 +6556,7 @@ components:
             properties:
               flagged:
                 type: boolean
-                description: Whether the content violates [OpenAI's usage policies](/policies/usage-policies).
+                description: Whether any of the below categories are flagged.
               categories:
                 type: object
                 description: A list of the categories, and whether they are flagged or not.
@@ -6809,7 +6933,7 @@ components:
           format: binary
         model:
           description: |
-            ID of the model to use. Only `whisper-1` is currently available.
+            ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
           example: whisper-1
           anyOf:
             - type: string
@@ -6842,7 +6966,7 @@ components:
           default: 0
         timestamp_granularities[]:
           description: |
-            The timestamp granularities to populate for this transcription. Any of these options: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
+            The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
           type: array
           items:
             type: string
@@ -6855,13 +6979,117 @@ components:
         - model
 
     # Note: This does not currently support the non-default response format types.
-    CreateTranscriptionResponse:
+    CreateTranscriptionResponseJson:
       type: object
+      description: Represents a transcription response returned by the model, based on the provided input.
       properties:
         text:
           type: string
+          description: The transcribed text.
+      required:
+        - text
+      x-oaiMeta:
+        name: The transcription object (JSON)
+        group: audio
+        example: *basic_transcription_response_example
+
+    TranscriptionSegment:  # one entry of `segments` in the verbose transcription/translation responses
+      type: object
+      properties:
+        id:
+          type: integer
+          description: Unique identifier of the segment.
+        seek:
+          type: integer
+          description: Seek offset of the segment.
+        start:
+          type: number
+          format: float
+          description: Start time of the segment in seconds.
+        end:
+          type: number
+          format: float
+          description: End time of the segment in seconds.
+        text:
+          type: string
+          description: Text content of the segment.
+        tokens:
+          type: array
+          items:
+            type: integer
+          description: Array of token IDs for the text content.
+        temperature:
+          type: number
+          format: float
+          description: Temperature parameter used for generating the segment.
+        avg_logprob:
+          type: number
+          format: float
+          description: Average logprob of the segment. If the value is lower than -1, consider the logprobs failed.
+        compression_ratio:
+          type: number
+          format: float
+          description: Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed.
+        no_speech_prob:
+          type: number
+          format: float
+          description: Probability of no speech in the segment. If the value is higher than 0.6 and the `avg_logprob` is below -1, consider this segment silent.
       required:
+        - id
+        - seek
+        - start
+        - end
         - text
+        - tokens
+        - temperature
+        - avg_logprob
+        - compression_ratio
+        - no_speech_prob
+
+    TranscriptionWord:  # one entry of `words` in CreateTranscriptionResponseVerboseJson
+      type: object
+      properties:
+        word:
+          type: string
+          description: The text content of the word.
+        start:
+          type: number
+          format: float
+          description: Start time of the word in seconds.
+        end:
+          type: number
+          format: float
+          description: End time of the word in seconds.
+      required: [word, start, end]  # all three properties are mandatory
+
+    CreateTranscriptionResponseVerboseJson:
+      type: object
+      description: Represents a verbose json transcription response returned by the model, based on the provided input.
+      properties:
+        language:
+          type: string
+          description: The language of the input audio.
+        duration:
+          type: number  # the response examples carry a JSON number here (8.470000267028809), not a string
+          description: The duration of the input audio.
+        text:
+          type: string
+          description: The transcribed text.
+        words:
+          type: array
+          description: Extracted words and their corresponding timestamps.
+          items:
+            $ref: '#/components/schemas/TranscriptionWord'
+        segments:
+          type: array
+          description: Segments of the transcribed text and their corresponding details.
+          items:
+            $ref: '#/components/schemas/TranscriptionSegment'
+      required: [language, duration, text]  # `words` and `segments` stay optional
+      x-oaiMeta:
+        name: The transcription object (Verbose JSON)
+        group: audio
+        example: *verbose_transcription_response_example
 
     CreateTranslationRequest:
       type: object
@@ -6875,7 +7103,7 @@ components:
           format: binary
         model:
           description: |
-            ID of the model to use. Only `whisper-1` is currently available.
+            ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
           example: whisper-1
           anyOf:
             - type: string
@@ -6901,7 +7129,7 @@ components:
         - model
 
     # Note: This does not currently support the non-default response format types.
-    CreateTranslationResponse:
+    CreateTranslationResponseJson:
       type: object
       properties:
         text:
@@ -6909,6 +7137,25 @@ components:
       required:
         - text
 
+    CreateTranslationResponseVerboseJson:  # verbose counterpart of CreateTranslationResponseJson; defines no `words` property
+      type: object
+      properties:
+        language:
+          type: string
+          description: The language of the output translation (always `english`).
+        duration:
+          type: number  # numeric, matching the verbose transcription response examples
+          description: The duration of the input audio.
+        text:
+          type: string
+          description: The translated text.
+        segments:
+          type: array
+          description: Segments of the translated text and their corresponding details.
+          items:
+            $ref: '#/components/schemas/TranscriptionSegment'
+      required: [language, duration, text]  # `segments` stays optional
+
     CreateSpeechRequest:
       type: object
       additionalProperties: false
@@ -6930,14 +7177,10 @@ components:
           type: string
           enum: ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
         response_format:
-          description: |-
-            The format to return audio in. 
-            Supported formats are `mp3`, `opus`, `aac`, `flac`, `pcm`, and `wav`. 
-            
-            The `pcm` audio format, similar to `wav` but without a header, utilizes a 24kHz sample rate, mono channel, and 16-bit depth in signed little-endian format.
+          description: "The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`."
           default: "mp3"
           type: string
-          enum: ["mp3", "opus", "aac", "flac", "pcm", "wav"]
+          enum: ["mp3", "opus", "aac", "flac", "wav", "pcm"]
         speed:
           description: "The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default."
           type: number
@@ -7594,8 +7837,8 @@ components:
           properties:
             code:
               type: string
-              description: One of `server_error` or `rate_limit_exceeded`.
-              enum: ["server_error", "rate_limit_exceeded"]
+              description: One of `server_error`, `rate_limit_exceeded`, or `invalid_prompt`.
+              enum: ["server_error", "rate_limit_exceeded", "invalid_prompt"]
             message:
               type: string
               description: A human-readable description of the error.
@@ -8725,6 +8968,12 @@ x-oaiMeta:
         - type: endpoint
           key: createTranslation
           path: createTranslation
+        - type: object
+          key: CreateTranscriptionResponseJson
+          path: json-object
+        - type: object
+          key: CreateTranscriptionResponseVerboseJson
+          path: verbose-json-object
     - id: chat
       title: Chat
       description: |
@@ -8844,7 +9093,7 @@ x-oaiMeta:
     - id: moderations
       title: Moderations
       description: |
-        Given a input text, outputs if the model classifies it as violating OpenAI's content policy.
+        Given some input text, outputs if the model classifies it as potentially harmful across several categories.
 
         Related guide: [Moderations](/docs/guides/moderation)
       sections:
@@ -8995,7 +9244,7 @@ x-oaiMeta:
       title: Completions
       legacy: true
       description: |
-        Given a prompt, the model will return one or more predicted completions along with the probabilities of alternative tokens at each position. Most developer should use our [Chat Completions API](/docs/guides/text-generation/text-generation-models) to leverage our best and newest models. Most models that support the legacy Completions endpoint [will be shut off on January 4th, 2024](/docs/deprecations/2023-07-06-gpt-and-embeddings).
+        Given a prompt, the model will return one or more predicted completions along with the probabilities of alternative tokens at each position. Most developers should use our [Chat Completions API](/docs/guides/text-generation/text-generation-models) to leverage our best and newest models.
       sections:
         - type: endpoint
           key: createCompletion