This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Adds token and request-based rate limiting with an example #29

Open · wants to merge 1 commit into base: main
11 changes: 11 additions & 0 deletions apps/docs/docs/modules/model-providers.md
@@ -55,3 +55,14 @@ await openai.stream(
}
);
```

### Rate limiting

Some model providers (e.g. `OpenAI`) enforce rate limits on the number of requests and tokens per minute.
By default, Promptable handles rate limiting for you, assuming you are using `text-davinci-003` as your model.

If you are using a different model, or you want to set your own rate limits, you can configure them explicitly:
```ts
// Codex model rate limits
new OpenAI(apiKey, { rateLimitConfig: { requestsPerMinute: 20, tokensPerMinute: 40000 } });
```
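
Rate limiting can also be turned off entirely. A minimal sketch, based on the `rate-limit-example.ts` file added later in this PR, where passing `rateLimitConfig: null` skips creating both limiters:
```ts
import { OpenAI } from "promptable";

// Passing null disables both the request and token rate limiters,
// so calls go straight to the OpenAI API with no client-side throttling.
const openai = new OpenAI(process.env.OPENAI_API_KEY || "", { rateLimitConfig: null });
```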
4 changes: 4 additions & 0 deletions examples/src/index.ts
@@ -40,6 +40,8 @@ import tracingWebPrompt from "./tracing-web-prompt"
import chainSimple from "./chain-simple";
import chainMemory from "./chain-memory";

import rateLimit from "./rate-limit-example";

// Add examples here!

const examples = {
@@ -83,6 +85,8 @@ const examples = {

"chain-simple": chainSimple,
"chain-memory": chainMemory,

"rate-limit": rateLimit,
};

const isExample = (arg: string): arg is keyof typeof examples =>
45 changes: 45 additions & 0 deletions examples/src/rate-limit-example.ts
@@ -0,0 +1,45 @@
/**
This example shows the built-in rate limiting functionality of Promptable.

We'll generate 25 requests and try to send them in parallel against the OpenAI API.
Since we're using the codex model, we have a rate limit of 20 requests per minute.

Without rate limiting, we'd expect some of the requests to fail.
With rate limiting, we expect all requests to succeed.
**/
import dotenv from "dotenv";
dotenv.config();
import { OpenAI } from "promptable";

const apiKey = process.env.OPENAI_API_KEY || "";

const attemptRequests = async (openai: OpenAI) => {
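// Fire 25 completions in parallel against code-davinci-002 and return how many failed,
// how many were sent, and the elapsed time in milliseconds.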
const text = "this is a test";
let responsesPromises = [];
for (let i = 0; i < 25; i++) {
responsesPromises.push(openai.generate(text, { model: "code-davinci-002" }));
}
const startTime = performance.now();
let responses = await Promise.all(responsesPromises);
const endTime = performance.now();
let numFailed = responses.filter(r => r === "failed").length;
return [numFailed, responses.length, endTime - startTime];
}

const run = async (_args: string[]) => {
// Setting rateLimitConfig to null disables rate limiting
const openaiNoLimit = new OpenAI(apiKey, { rateLimitConfig: null });
let [numFailed, total, time] = await attemptRequests(openaiNoLimit);
console.log(`Without rate limiting, ${numFailed}/${total} requests failed. Total time: ${time.toFixed(0)} ms`);
console.log("Waiting 180 seconds for rate limit to reset...");
// Sleep for 180 seconds to allow the rate limit to reset
await new Promise(r => setTimeout(r, 180000));
/* The default rateLimitConfig targets text-davinci-003, so here we explicitly set the
codex model's limits, with plenty of headroom (6 requests per minute instead of 20)
because OpenAI's rate limiter for codex is a little unpredictable. */
const openaiLimit = new OpenAI(apiKey, { rateLimitConfig: { requestsPerMinute: 6, tokensPerMinute: 20000 } });
[numFailed, total, time] = await attemptRequests(openaiLimit);
console.log(`With rate limiting, ${numFailed}/${total} requests failed. Total time: ${time.toFixed(0)} ms`);
};

export default run;
1 change: 1 addition & 0 deletions packages/promptable/package.json
@@ -26,6 +26,7 @@
"chalk": "^4.1.2",
"csv-parse": "^5.3.4",
"gpt3-tokenizer": "^1.1.4",
"limiter": "^2.1.0",
"openai": "^3.1.0",
"typescript": "latest",
"uuid": "^9.0.0",
2 changes: 1 addition & 1 deletion packages/promptable/src/chains/LLMChain.ts
@@ -10,7 +10,7 @@ export class LLMChain<
constructor(
public prompt: Prompt<T, P>,
public provider: CompletionsModelProvider
) {}
) { }

protected async _run(variables: Record<T, string>) {
// TODO: fix trace so that the anonymous function isn't needed
77 changes: 64 additions & 13 deletions packages/promptable/src/providers/OpenAI.ts
@@ -1,3 +1,4 @@
import { RateLimiter } from "limiter";
import {
CompletionsModelProvider,
EmbeddingsModelProvider,
@@ -9,7 +10,7 @@ import { unescapeStopTokens } from "@utils/unescape-stop-tokens";
import { Document } from "src";
import GPT3Tokenizer from "gpt3-tokenizer";

class OpenAIConfiguration extends Configuration {}
class OpenAIConfiguration extends Configuration { }

type GenerateCompletionOptions = {
/**
@@ -104,34 +105,67 @@ type GenerateCompletionOptions = {
user?: string;
};


type RateLimitConfig = { requestsPerMinute: number, tokensPerMinute: number } | null;

export type OpenAIProviderConfig = {
rateLimitConfig?: RateLimitConfig
};

function normalizeRequestLimits(perMinute: number) {
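// Convert a requests-per-minute limit into the [tokensPerInterval, interval (ms)] pair
// expected by the `limiter` package: if the per-millisecond rate is below 1, allow a
// single request every 1 / rate milliseconds instead.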
let perMillisecond = perMinute / 60000;
let interval = 1; // ms
if (perMillisecond < 1) {
interval = 1 / perMillisecond;
perMillisecond = 1;
}
return [perMillisecond, interval];
}

export class OpenAI
extends ModelProvider
implements CompletionsModelProvider, EmbeddingsModelProvider
{
implements CompletionsModelProvider, EmbeddingsModelProvider {
apiKey: string;
config: OpenAIConfiguration;
openAIConfig: OpenAIConfiguration;
api: OpenAIApi;
completionsConfig = DEFAULT_COMPLETION_OPTIONS;
embeddingsConfig: OpenAIEmbeddingsConfig = DEFAULT_OPENAI_EMBEDDINGS_CONFIG;
tokenizer: OpenAITokenizer = new OpenAITokenizer();
tokenRateLimiter: RateLimiter | null = null;
requestRateLimiter: RateLimiter | null = null;


constructor(apiKey: string) {
constructor(apiKey: string, { rateLimitConfig }: OpenAIProviderConfig = {}) {
super(ModelProviderType.OpenAI);
this.apiKey = apiKey;
rateLimitConfig = rateLimitConfig === undefined ? OPENAI_DEFAULT_RATE_LIMITS : rateLimitConfig;
if (rateLimitConfig) {
const [requestsPerInterval, rpmInterval] = normalizeRequestLimits(rateLimitConfig.requestsPerMinute);
// NOTE: the token rate limiter is hard to test properly
this.tokenRateLimiter = new RateLimiter({ tokensPerInterval: rateLimitConfig.tokensPerMinute, interval: 'minute' });
this.requestRateLimiter = new RateLimiter({ tokensPerInterval: requestsPerInterval, interval: rpmInterval });
}

const config = new OpenAIConfiguration({
this.openAIConfig = new OpenAIConfiguration({
apiKey,
});

this.config = config;

this.api = new OpenAIApi(config);
this.api = new OpenAIApi(this.openAIConfig);
}

countTokens(text: string) {
return this.tokenizer.countTokens(text);
}

protected async enforceRateLimit(promptText: string, options: { max_tokens?: number | null }) {
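// Wait until both limiters have capacity: reserve the estimated token cost
// (prompt tokens plus max_tokens) from the token bucket, then one slot from the request bucket.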
if (this.tokenRateLimiter) {
await this.tokenRateLimiter.removeTokens(this.countTokens(promptText) + (options.max_tokens || 0));
}
if (this.requestRateLimiter) {
await this.requestRateLimiter.removeTokens(1);
}
}

async generate(
promptText: string,
options: GenerateCompletionOptions = DEFAULT_COMPLETION_OPTIONS
@@ -141,15 +175,24 @@ export class OpenAI
options.stop = unescapeStopTokens(options.stop);
}

await this.enforceRateLimit(promptText, options);

const res = await this.api.createCompletion({
prompt: promptText,
...options,
model: options.model || DEFAULT_COMPLETION_OPTIONS.model,
});

return res.data.choices[0]?.text || "";
} catch (e) {
console.log(e);
// @ts-expect-error
if (e.response) {
// @ts-expect-error
console.error(`Status code: ${e.response.status}. Data: ${JSON.stringify(e.response.data, null, 2)}`);

} else {
// @ts-expect-error
console.error(e.message);
}
}
return "failed";
}
@@ -216,6 +259,7 @@ export class OpenAI
text: string,
options: Omit<CreateEmbeddingRequest, "input">
) => {
await this.enforceRateLimit(text, {});
const result = await this.api.createEmbedding({
...options,
input: text.replace(/\n/g, " "),
@@ -229,11 +273,13 @@
options: Omit<CreateEmbeddingRequest, "input">
) => {
const batchResults = await Promise.all(
texts.map((text) =>
this.api.createEmbedding({
texts.map(async (text) => {
await this.enforceRateLimit(text, {});
return await this.api.createEmbedding({
...options,
input: text.replace(/\n/g, " "),
})
}
)
);

@@ -273,6 +319,11 @@ export const OPENAI_MODEL_SETTINGS = {
},
};

/* Taken from: https://platform.openai.com/docs/guides/rate-limits/overview
By default we'll use the rate limits for pay-as-you-go (after 48 hours) users for text-davinci-003.
*/
export const OPENAI_DEFAULT_RATE_LIMITS = { requestsPerMinute: 3000, tokensPerMinute: 250000 };

interface OpenAIEmbeddingsConfig {
model: string;
}