This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Adds token and request-based rate limiting with an example #29

Open · wants to merge 1 commit into base: main
11 changes: 11 additions & 0 deletions apps/docs/docs/modules/model-providers.md
@@ -55,3 +55,14 @@ await openai.stream(
}
);
```

### Rate limiting

Some model providers (e.g. `OpenAI`) enforce rate limits on the number of requests and tokens per minute.
By default, Promptable handles rate limiting for you, assuming you are using `text-davinci-003` as your model.

If you are using a different model, or you want to set your own rate limits, you can configure them explicitly:
```ts
// Codex model rate limits
new OpenAI(apiKey, { rateLimitConfig: { requestsPerMinute: 20, tokensPerMinute: 40000 } });
```
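
Rate limiting can also be turned off entirely. A minimal sketch, based on the `rate-limit-example.ts` file added later in this PR, where passing `rateLimitConfig: null` skips creating both limiters:
```ts
import { OpenAI } from "promptable";

// Passing null disables both the request and token rate limiters,
// so calls go straight to the OpenAI API with no client-side throttling.
const openai = new OpenAI(process.env.OPENAI_API_KEY || "", { rateLimitConfig: null });
```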
4 changes: 4 additions & 0 deletions examples/src/index.ts
@@ -40,6 +40,8 @@ import tracingWebPrompt from "./tracing-web-prompt"
import chainSimple from "./chain-simple";
import chainMemory from "./chain-memory";

import rateLimit from "./rate-limit-example";

// Add examples here!

const examples = {
@@ -83,6 +85,8 @@ const examples = {

"chain-simple": chainSimple,
"chain-memory": chainMemory,

"rate-limit": rateLimit,
};

const isExample = (arg: string): arg is keyof typeof examples =>
45 changes: 45 additions & 0 deletions examples/src/rate-limit-example.ts
@@ -0,0 +1,45 @@
/**
This example shows the built-in rate limiting functionality of Promptable.

We'll generate 25 requests and try to send them in parallel against the OpenAI API.
Since we're using the codex model, we have a rate limit of 20 requests per minute.

Without rate limiting, we'd expect some of the requests to fail.
With rate limiting, we expect all requests to succeed.
**/
import dotenv from "dotenv";
dotenv.config();
import { OpenAI } from "promptable";

const apiKey = process.env.OPENAI_API_KEY || "";

const attemptRequests = async (openai: OpenAI) => {
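// Fire 25 completions in parallel against code-davinci-002 and return how many failed,
// how many were sent, and the elapsed time in milliseconds.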
const text = "this is a test";
let responsesPromises = [];
for (let i = 0; i < 25; i++) {
responsesPromises.push(openai.generate(text, { model: "code-davinci-002" }));
}
const startTime = performance.now();
let responses = await Promise.all(responsesPromises);
const endTime = performance.now();
let numFailed = responses.filter(r => r === "failed").length;
return [numFailed, responses.length, endTime - startTime];
}

const run = async (_args: string[]) => {
// Setting rateLimitConfig to null disables rate limiting
const openaiNoLimit = new OpenAI(apiKey, { rateLimitConfig: null });
let [numFailed, total, time] = await attemptRequests(openaiNoLimit);
console.log(`Without rate limiting, ${numFailed}/${total} requests failed. Total time: ${time.toFixed(0)} ms`);
console.log("Waiting 180 seconds for rate limit to reset...");
// Sleep for 180 seconds to allow the rate limit to reset
await new Promise(r => setTimeout(r, 180000));
/* The default rateLimitConfig targets text-davinci-003, so here we explicitly set the
codex model's limits, with plenty of headroom (6 requests per minute instead of 20)
because OpenAI's rate limiter for codex is a little unpredictable. */
const openaiLimit = new OpenAI(apiKey, { rateLimitConfig: { requestsPerMinute: 6, tokensPerMinute: 20000 } });
[numFailed, total, time] = await attemptRequests(openaiLimit);
console.log(`With rate limiting, ${numFailed}/${total} requests failed. Total time: ${time.toFixed(0)} ms`);
};

export default run;
1 change: 1 addition & 0 deletions packages/promptable/package.json
@@ -26,6 +26,7 @@
"chalk": "^4.1.2",
"csv-parse": "^5.3.4",
"gpt3-tokenizer": "^1.1.4",
"limiter": "^2.1.0",
"openai": "^3.1.0",
"typescript": "latest",
"uuid": "^9.0.0",
2 changes: 1 addition & 1 deletion packages/promptable/src/chains/LLMChain.ts
@@ -10,7 +10,7 @@ export class LLMChain<
constructor(
public prompt: Prompt<T, P>,
public provider: CompletionsModelProvider
) {}
) { }

protected async _run(variables: Record<T, string>) {
// TODO: fix trace so that the anonymous function isn't needed
77 changes: 64 additions & 13 deletions packages/promptable/src/providers/OpenAI.ts
@@ -1,3 +1,4 @@
import { RateLimiter } from "limiter";
import {
CompletionsModelProvider,
EmbeddingsModelProvider,
@@ -9,7 +10,7 @@ import { unescapeStopTokens } from "@utils/unescape-stop-tokens";
import { Document } from "src";
import GPT3Tokenizer from "gpt3-tokenizer";

class OpenAIConfiguration extends Configuration {}
class OpenAIConfiguration extends Configuration { }

type GenerateCompletionOptions = {
/**
@@ -104,34 +105,67 @@ type GenerateCompletionOptions = {
user?: string;
};


type RateLimitConfig = { requestsPerMinute: number, tokensPerMinute: number } | null;

export type OpenAIProviderConfig = {
rateLimitConfig?: RateLimitConfig
};

function normalizeRequestLimits(perMinute: number) {
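// Convert a requests-per-minute limit into the [tokensPerInterval, interval (ms)] pair
// expected by the `limiter` package: if the per-millisecond rate is below 1, allow a
// single request every 1 / rate milliseconds instead.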
let perMillisecond = perMinute / 60000;
let interval = 1; // ms
if (perMillisecond < 1) {
interval = 1 / perMillisecond;
perMillisecond = 1;
}
return [perMillisecond, interval];
}

export class OpenAI
extends ModelProvider
implements CompletionsModelProvider, EmbeddingsModelProvider
{
implements CompletionsModelProvider, EmbeddingsModelProvider {
apiKey: string;
config: OpenAIConfiguration;
openAIConfig: OpenAIConfiguration;
api: OpenAIApi;
completionsConfig = DEFAULT_COMPLETION_OPTIONS;
embeddingsConfig: OpenAIEmbeddingsConfig = DEFAULT_OPENAI_EMBEDDINGS_CONFIG;
tokenizer: OpenAITokenizer = new OpenAITokenizer();
tokenRateLimiter: RateLimiter | null = null;
requestRateLimiter: RateLimiter | null = null;


constructor(apiKey: string) {
constructor(apiKey: string, { rateLimitConfig }: OpenAIProviderConfig = {}) {
super(ModelProviderType.OpenAI);
this.apiKey = apiKey;
rateLimitConfig = rateLimitConfig === undefined ? OPENAI_DEFAULT_RATE_LIMITS : rateLimitConfig;
if (rateLimitConfig) {
const [requestsPerInterval, rpmInterval] = normalizeRequestLimits(rateLimitConfig.requestsPerMinute);
// NOTE: the token rate limiter is hard to test properly
this.tokenRateLimiter = new RateLimiter({ tokensPerInterval: rateLimitConfig.tokensPerMinute, interval: 'minute' });
this.requestRateLimiter = new RateLimiter({ tokensPerInterval: requestsPerInterval, interval: rpmInterval });
}

const config = new OpenAIConfiguration({
this.openAIConfig = new OpenAIConfiguration({
apiKey,
});

this.config = config;

this.api = new OpenAIApi(config);
this.api = new OpenAIApi(this.openAIConfig);
}

countTokens(text: string) {
return this.tokenizer.countTokens(text);
}

protected async enforceRateLimit(promptText: string, options: { max_tokens?: number | null }) {
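// Wait until both limiters have capacity: reserve the estimated token cost
// (prompt tokens plus max_tokens) from the token bucket, then one slot from the request bucket.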
if (this.tokenRateLimiter) {
await this.tokenRateLimiter.removeTokens(this.countTokens(promptText) + (options.max_tokens || 0));
}
if (this.requestRateLimiter) {
await this.requestRateLimiter.removeTokens(1);
}
}

async generate(
promptText: string,
options: GenerateCompletionOptions = DEFAULT_COMPLETION_OPTIONS
@@ -141,15 +175,24 @@ export class OpenAI
options.stop = unescapeStopTokens(options.stop);
}

await this.enforceRateLimit(promptText, options);

const res = await this.api.createCompletion({
prompt: promptText,
...options,
model: options.model || DEFAULT_COMPLETION_OPTIONS.model,
});

return res.data.choices[0]?.text || "";
} catch (e) {
console.log(e);
// @ts-expect-error
if (e.response) {
// @ts-expect-error
console.error(`Status code: ${e.response.status}. Data: ${JSON.stringify(e.response.data, null, 2)}`);

} else {
// @ts-expect-error
console.error(e.message);
}
}
return "failed";
}
@@ -216,6 +259,7 @@ export class OpenAI
text: string,
options: Omit<CreateEmbeddingRequest, "input">
) => {
await this.enforceRateLimit(text, {});
const result = await this.api.createEmbedding({
...options,
input: text.replace(/\n/g, " "),
@@ -229,11 +273,13 @@
options: Omit<CreateEmbeddingRequest, "input">
) => {
const batchResults = await Promise.all(
texts.map((text) =>
this.api.createEmbedding({
texts.map(async (text) => {
await this.enforceRateLimit(text, {});
return await this.api.createEmbedding({
...options,
input: text.replace(/\n/g, " "),
})
}
)
);

@@ -273,6 +319,11 @@ export const OPENAI_MODEL_SETTINGS = {
},
};

/* Taken from: https://platform.openai.com/docs/guides/rate-limits/overview
By default we'll use the rate limits for pay-as-you-go (after 48 hours) users for text-davinci-003.
*/
export const OPENAI_DEFAULT_RATE_LIMITS = { requestsPerMinute: 3000, tokensPerMinute: 250000 };

interface OpenAIEmbeddingsConfig {
model: string;
}