diff --git a/docs/assets/openapi.json b/docs/assets/openapi.json index 67d22858c..652c6d1b8 100644 --- a/docs/assets/openapi.json +++ b/docs/assets/openapi.json @@ -1 +1 @@ -{"openapi":"3.1.0","info":{"title":"♾️ Infinity - Embedding Inference Server","summary":"Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip. Infinity is developed under MIT License at https://github.com/michaelfeil/infinity.","contact":{"name":"Michael Feil, Raphael Wirth"},"license":{"name":"MIT License","identifier":"MIT"},"version":"0.0.77"},"paths":{"/health":{"get":{"summary":" Health","description":"health check endpoint\n\nReturns:\n dict(unix=float): dict with unix time stamp","operationId":"health","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"type":"number"},"type":"object","title":"Response Health"}}}}}}},"/":{"get":{"summary":"Redirect","operationId":"redirect__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/models":{"get":{"summary":" Models","description":"get models endpoint","operationId":"models","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIModelInfo"}}}}}}},"/embeddings":{"post":{"summary":" Embeddings","description":"Encode Embeddings. Supports with multimodal inputs. 
Aligned with OpenAI Embeddings API.\n\n## Running Text Embeddings\n```python\nimport requests, base64\nrequests.post(\"http://..:7997/embeddings\",\n json={\"model\":\"openai/clip-vit-base-patch32\",\"input\":[\"Two cute cats.\"]})\n```\n\n## Running Image Embeddings\n```python\nrequests.post(\"http://..:7997/embeddings\",\n json={\n \"model\": \"openai/clip-vit-base-patch32\",\n \"encoding_format\": \"base64\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n # can also be base64 encoded\n ],\n # set extra modality to image to process as image\n \"modality\": \"image\"\n)\n```\n\n## Running Audio Embeddings\n```python\nimport requests, base64\nurl = \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\"\n\ndef url_to_base64(url, modality = \"image\"):\n '''small helper to convert url to base64 without server requiring access to the url'''\n response = requests.get(url)\n response.raise_for_status()\n base64_encoded = base64.b64encode(response.content).decode('utf-8')\n mimetype = f\"{modality}/{url.split('.')[-1]}\"\n return f\"data:{mimetype};base64,{base64_encoded}\"\n\nrequests.post(\"http://localhost:7997/embeddings\",\n json={\n \"model\": \"laion/larger_clap_general\",\n \"encoding_format\": \"float\",\n \"input\": [\n url, url_to_base64(url, \"audio\")\n ],\n # set extra modality to audio to process as audio\n \"modality\": \"audio\"\n }\n)\n```\n\n## Running via OpenAI Client\n```python\nfrom openai import OpenAI # pip install openai==1.51.0\nclient = OpenAI(base_url=\"http://localhost:7997/\")\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[url_to_base64(url, \"audio\")],\n encoding_format=\"float\",\n extra_body={\n \"modality\": \"audio\"\n }\n)\n\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[\"the sound of a beep\", \"the sound of a cat\"],\n encoding_format=\"base64\", # base64: optional 
high performance setting\n extra_body={\n \"modality\": \"text\"\n }\n)\n```\n\n### Hint: Run all the above models on one server:\n```bash\ninfinity_emb v2 --model-id BAAI/bge-small-en-v1.5 --model-id openai/clip-vit-base-patch32 --model-id laion/larger_clap_general\n```","operationId":"embeddings","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/MultiModalOpenAIEmbedding"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/rerank":{"post":{"summary":" Rerank","description":"Rerank documents. Aligned with Cohere API (https://docs.cohere.com/reference/rerank)\n\n```python\nimport requests\nrequests.post(\"http://..:7997/rerank\",\n json={\n \"model\":\"mixedbread-ai/mxbai-rerank-xsmall-v1\",\n \"query\":\"Where is Munich?\",\n \"documents\":[\"Munich is in Germany.\", \"The sky is blue.\"]\n })\n```","operationId":"rerank","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RerankInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ReRankResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/classify":{"post":{"summary":" Classify","description":"Score or Classify Sentiments\n\n```python\nimport requests\nrequests.post(\"http://..:7997/classify\",\n json={\"model\":\"SamLowe/roberta-base-go_emotions\",\"input\":[\"I am not having a great day.\"]})\n```","operationId":"classify","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyInput"}}},"required":true},"responses":{"200":{"description":"Successful 
Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/embeddings_image":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `image`","description":"Encode Embeddings from Image files\n\nSupports URLs of Images and Base64-encoded Images\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_image\",\n json={\n \"model\":\"openai/clip-vit-base-patch32\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n \"data:image/png;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDIMAGE\"\n ]\n })\n```","operationId":"embeddings_image","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ImageEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/embeddings_audio":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `audio`","description":"Encode Embeddings from Audio files\n\nSupports URLs of Audios and Base64-encoded Audios\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_audio\",\n json={\n \"model\":\"laion/larger_clap_general\",\n \"input\": [\n \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\",\n \"data:audio/wav;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDAUDIO\"\n ]\n })\n```","operationId":"embeddings_audio","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/AudioEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful 
Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/metrics":{"get":{"summary":"Metrics","description":"Endpoint that serves Prometheus metrics.","operationId":"metrics_metrics_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}}},"components":{"schemas":{"AudioEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"AudioEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ClassifyInput":{"properties":{"input":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"raw_scores":{"type":"boolean","title":"Raw 
Scores","default":false}},"type":"object","required":["input"],"title":"ClassifyInput"},"ClassifyResult":{"properties":{"object":{"type":"string","enum":["classify"],"const":"classify","title":"Object","default":"classify"},"data":{"items":{"items":{"$ref":"#/components/schemas/_ClassifyObject"},"type":"array"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"ClassifyResult","description":"Result of classification."},"EmbeddingEncodingFormat":{"type":"string","enum":["float","base64"],"title":"EmbeddingEncodingFormat"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ImageEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","
required":["input"],"title":"ImageEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ModelInfo":{"properties":{"id":{"type":"string","title":"Id"},"stats":{"type":"object","title":"Stats"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"owned_by":{"type":"string","enum":["infinity"],"const":"infinity","title":"Owned By","default":"infinity"},"created":{"type":"integer","title":"Created"},"backend":{"type":"string","title":"Backend","default":""},"capabilities":{"items":{"type":"string"},"type":"array","uniqueItems":true,"title":"Capabilities","default":[]}},"type":"object","required":["id","stats"],"title":"ModelInfo"},"MultiModalOpenAIEmbedding":{"oneOf":[{"$ref":"#/components/schemas/_OpenAIEmbeddingInput_Text"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Audio"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Image"}],"title":"MultiModalOpenAIEmbedding"},"OpenAIEmbeddingInput_Audio":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"strin
g","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["audio"],"const":"audio","title":"Modality","default":"audio"}},"type":"object","required":["input"],"title":"OpenAIEmbeddingInput_Audio"},"OpenAIEmbeddingInput_Image":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["image"],"const":"image","title":"Modality","default":"image"}},"type":"object","required":["input"],"title":"OpenAIEmbeddingInput_Image"},"OpenAIEmbeddingResult":{"properties":{"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"},"data":{"items":{"$ref":"#/components/schemas/_EmbeddingObject"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"
title":"OpenAIEmbeddingResult"},"OpenAIModelInfo":{"properties":{"data":{"items":{"$ref":"#/components/schemas/ModelInfo"},"type":"array","title":"Data"},"object":{"type":"string","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"OpenAIModelInfo"},"ReRankResult":{"properties":{"object":{"type":"string","enum":["rerank"],"const":"rerank","title":"Object","default":"rerank"},"results":{"items":{"$ref":"#/components/schemas/_ReRankObject"},"type":"array","title":"Results"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["results","model","usage"],"title":"ReRankResult","description":"Following the Cohere protocol for Rerankers."},"RerankInput":{"properties":{"query":{"type":"string","maxLength":122880,"title":"Query"},"documents":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Documents"},"return_documents":{"type":"boolean","title":"Return Documents","default":false},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false},"model":{"type":"string","title":"Model","default":"default/not-specified"},"top_n":{"anyOf":[{"type":"integer","exclusiveMinimum":0.0},{"type":"null"}],"title":"Top N"}},"type":"object","required":["query","documents"],"title":"RerankInput","description":"Input for reranking"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error 
Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"_ClassifyObject":{"properties":{"score":{"type":"number","title":"Score"},"label":{"type":"string","title":"Label"}},"type":"object","required":["score","label"],"title":"_ClassifyObject"},"_EmbeddingObject":{"properties":{"object":{"type":"string","enum":["embedding"],"const":"embedding","title":"Object","default":"embedding"},"embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"string","format":"binary"},{"items":{"items":{"type":"number"},"type":"array"},"type":"array"}],"title":"Embedding"},"index":{"type":"integer","title":"Index"}},"type":"object","required":["embedding","index"],"title":"_EmbeddingObject"},"_OpenAIEmbeddingInput_Text":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1},{"type":"string","maxLength":122880}],"title":"Input"},"modality":{"type":"string","enum":["text"],"const":"text","title":"Modality","default":"text"}},"type":"object","required":["input"],"title":"_OpenAIEmbeddingInput_Text","description":"helper"},"_ReRankObject":{"properties":{"relevance_score":{"type":"number","title":"Relevance Score"},"index":{"type":"integer","title":"Index"},"document":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Document"}},"type":"object","required":["relevance_score","index"],"title":"_ReRankObject"},"_Usage":{"properties":{"prompt_tokens":{"type":"integer","title":"Prompt Tokens"},"total_tokens":{"type":"integer","title":"Total Tokens"}},"type":"object","required":["prompt_tokens","total_tokens"],"title":"_Usage"}}}} \ No newline at end of file 
+{"openapi":"3.1.0","info":{"title":"♾️ Infinity - Embedding Inference Server","summary":"Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip. Infinity is developed under MIT License at https://github.com/michaelfeil/infinity.","contact":{"name":"Michael Feil, Raphael Wirth"},"license":{"name":"MIT License","identifier":"MIT"},"version":"0.0.77"},"paths":{"/health":{"get":{"summary":" Health","description":"health check endpoint\n\nReturns:\n dict(unix=float): dict with unix time stamp","operationId":"health","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"type":"number"},"type":"object","title":"Response Health"}}}}}}},"/":{"get":{"summary":"Redirect","operationId":"redirect__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/models":{"get":{"summary":" Models","description":"get models endpoint","operationId":"models","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIModelInfo"}}}}}}},"/embeddings":{"post":{"summary":" Embeddings","description":"Encode Embeddings. Supports with multimodal inputs. 
Aligned with OpenAI Embeddings API.\n\n## Running Text Embeddings\n```python\nimport requests, base64\nrequests.post(\"http://..:7997/embeddings\",\n json={\"model\":\"openai/clip-vit-base-patch32\",\"input\":[\"Two cute cats.\"]})\n```\n\n## Running Image Embeddings\n```python\nrequests.post(\"http://..:7997/embeddings\",\n json={\n \"model\": \"openai/clip-vit-base-patch32\",\n \"encoding_format\": \"base64\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n # can also be base64 encoded\n ],\n # set extra modality to image to process as image\n \"modality\": \"image\"\n)\n```\n\n## Running Audio Embeddings\n```python\nimport requests, base64\nurl = \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\"\n\ndef url_to_base64(url, modality = \"image\"):\n '''small helper to convert url to base64 without server requiring access to the url'''\n response = requests.get(url)\n response.raise_for_status()\n base64_encoded = base64.b64encode(response.content).decode('utf-8')\n mimetype = f\"{modality}/{url.split('.')[-1]}\"\n return f\"data:{mimetype};base64,{base64_encoded}\"\n\nrequests.post(\"http://localhost:7997/embeddings\",\n json={\n \"model\": \"laion/larger_clap_general\",\n \"encoding_format\": \"float\",\n \"input\": [\n url, url_to_base64(url, \"audio\")\n ],\n # set extra modality to audio to process as audio\n \"modality\": \"audio\"\n }\n)\n```\n\n## Running via OpenAI Client\n```python\nfrom openai import OpenAI # pip install openai==1.51.0\nclient = OpenAI(base_url=\"http://localhost:7997/\")\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[url_to_base64(url, \"audio\")],\n encoding_format=\"float\",\n extra_body={\n \"modality\": \"audio\"\n }\n)\n\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[\"the sound of a beep\", \"the sound of a cat\"],\n encoding_format=\"base64\", # base64: optional 
high performance setting\n extra_body={\n \"modality\": \"text\"\n }\n)\n```\n\n### Hint: Run all the above models on one server:\n```bash\ninfinity_emb v2 --model-id BAAI/bge-small-en-v1.5 --model-id openai/clip-vit-base-patch32 --model-id laion/larger_clap_general\n```","operationId":"embeddings","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/MultiModalOpenAIEmbedding"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/rerank":{"post":{"summary":" Rerank","description":"Rerank documents. Aligned with Cohere API (https://docs.cohere.com/reference/rerank)\n\n```python\nimport requests\nrequests.post(\"http://..:7997/rerank\",\n json={\n \"model\":\"mixedbread-ai/mxbai-rerank-xsmall-v1\",\n \"query\":\"Where is Munich?\",\n \"documents\":[\"Munich is in Germany.\", \"The sky is blue.\"]\n })\n```","operationId":"rerank","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RerankInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ReRankResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/classify":{"post":{"summary":" Classify","description":"Score or Classify Sentiments\n\n```python\nimport requests\nrequests.post(\"http://..:7997/classify\",\n json={\"model\":\"SamLowe/roberta-base-go_emotions\",\"input\":[\"I am not having a great day.\"]})\n```","operationId":"classify","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyInput"}}},"required":true},"responses":{"200":{"description":"Successful 
Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/embeddings_image":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `image`","description":"Encode Embeddings from Image files\n\nSupports URLs of Images and Base64-encoded Images\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_image\",\n json={\n \"model\":\"openai/clip-vit-base-patch32\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n \"data:image/png;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDIMAGE\"\n ]\n })\n```","operationId":"embeddings_image","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ImageEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/embeddings_audio":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `audio`","description":"Encode Embeddings from Audio files\n\nSupports URLs of Audios and Base64-encoded Audios\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_audio\",\n json={\n \"model\":\"laion/larger_clap_general\",\n \"input\": [\n \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\",\n \"data:audio/wav;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDAUDIO\"\n ]\n })\n```","operationId":"embeddings_audio","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/AudioEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful 
Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/metrics":{"get":{"summary":"Metrics","description":"Endpoint that serves Prometheus metrics.","operationId":"metrics_metrics_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}}},"components":{"schemas":{"AudioEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"AudioEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ClassifyInput":{"properties":{"input":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"raw_scores":{"type":"boolean","title":"Raw 
Scores","default":false}},"type":"object","required":["input"],"title":"ClassifyInput"},"ClassifyResult":{"properties":{"object":{"type":"string","enum":["classify"],"const":"classify","title":"Object","default":"classify"},"data":{"items":{"items":{"$ref":"#/components/schemas/_ClassifyObject"},"type":"array"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"ClassifyResult","description":"Result of classification."},"EmbeddingEncodingFormat":{"type":"string","enum":["float","base64"],"title":"EmbeddingEncodingFormat"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ImageEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","
required":["input"],"title":"ImageEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ModelInfo":{"properties":{"id":{"type":"string","title":"Id"},"stats":{"type":"object","title":"Stats"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"owned_by":{"type":"string","enum":["infinity"],"const":"infinity","title":"Owned By","default":"infinity"},"created":{"type":"integer","title":"Created"},"backend":{"type":"string","title":"Backend","default":""},"capabilities":{"items":{"type":"string"},"type":"array","uniqueItems":true,"title":"Capabilities","default":[]}},"type":"object","required":["id","stats"],"title":"ModelInfo"},"MultiModalOpenAIEmbedding":{"oneOf":[{"$ref":"#/components/schemas/_OpenAIEmbeddingInput_Text"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Audio"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Image"}],"title":"MultiModalOpenAIEmbedding"},"OpenAIEmbeddingInput_Audio":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"strin
g","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["audio"],"const":"audio","title":"Modality","default":"audio"}},"type":"object","required":["input"],"title":"OpenAIEmbeddingInput_Audio"},"OpenAIEmbeddingInput_Image":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["image"],"const":"image","title":"Modality","default":"image"}},"type":"object","required":["input"],"title":"OpenAIEmbeddingInput_Image"},"OpenAIEmbeddingResult":{"properties":{"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"},"data":{"items":{"$ref":"#/components/schemas/_EmbeddingObject"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"
title":"OpenAIEmbeddingResult"},"OpenAIModelInfo":{"properties":{"data":{"items":{"$ref":"#/components/schemas/ModelInfo"},"type":"array","title":"Data"},"object":{"type":"string","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"OpenAIModelInfo"},"ReRankResult":{"properties":{"object":{"type":"string","enum":["rerank"],"const":"rerank","title":"Object","default":"rerank"},"results":{"items":{"$ref":"#/components/schemas/_ReRankObject"},"type":"array","title":"Results"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["results","model","usage"],"title":"ReRankResult","description":"Following the Cohere protocol for Rerankers."},"RerankInput":{"properties":{"query":{"type":"string","maxLength":122880,"title":"Query"},"documents":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Documents"},"return_documents":{"type":"boolean","title":"Return Documents","default":false},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false},"model":{"type":"string","title":"Model","default":"default/not-specified"},"top_n":{"anyOf":[{"type":"integer","exclusiveMinimum":0.0},{"type":"null"}],"title":"Top N"},"max_query_tokens":{"anyOf":[{"type":"integer","exclusiveMinimum":0.0},{"type":"null"}],"title":"Max Query Tokens","description":"Head-truncate the query to at most N tokens before scoring. Clamped to the model's server-side ceiling: a request may lower this but not raise it above the configured limit. Omit or null to use the server ceiling."},"max_tokens_per_doc":{"anyOf":[{"type":"integer","exclusiveMinimum":0.0},{"type":"null"}],"title":"Max Tokens Per Doc","description":"Head-truncate each document to at most N tokens before scoring (Cohere v2 compatible). 
Clamped to the model's server-side ceiling: a request may lower this but not raise it. Omit or null to use the server ceiling."},"max_pair_tokens":{"anyOf":[{"type":"integer","exclusiveMinimum":0.0},{"type":"null"}],"title":"Max Pair Tokens","description":"Cap the joined (query, document) pair to at most N tokens, trimming the longest side first. Clamped to the model's server-side ceiling: a request may lower this but not raise it. Omit or null to use the server ceiling."}},"type":"object","required":["query","documents"],"title":"RerankInput","description":"Input for reranking"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"_ClassifyObject":{"properties":{"score":{"type":"number","title":"Score"},"label":{"type":"string","title":"Label"}},"type":"object","required":["score","label"],"title":"_ClassifyObject"},"_EmbeddingObject":{"properties":{"object":{"type":"string","enum":["embedding"],"const":"embedding","title":"Object","default":"embedding"},"embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"string","format":"binary"},{"items":{"items":{"type":"number"},"type":"array"},"type":"array"}],"title":"Embedding"},"index":{"type":"integer","title":"Index"}},"type":"object","required":["embedding","index"],"title":"_EmbeddingObject"},"_OpenAIEmbeddingInput_Text":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1},{"type":"string","ma
xLength":122880}],"title":"Input"},"modality":{"type":"string","enum":["text"],"const":"text","title":"Modality","default":"text"}},"type":"object","required":["input"],"title":"_OpenAIEmbeddingInput_Text","description":"helper"},"_ReRankObject":{"properties":{"relevance_score":{"type":"number","title":"Relevance Score"},"index":{"type":"integer","title":"Index"},"document":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Document"}},"type":"object","required":["relevance_score","index"],"title":"_ReRankObject"},"_Usage":{"properties":{"prompt_tokens":{"type":"integer","title":"Prompt Tokens"},"total_tokens":{"type":"integer","title":"Total Tokens"}},"type":"object","required":["prompt_tokens","total_tokens"],"title":"_Usage"}}}} \ No newline at end of file diff --git a/libs/infinity_emb/infinity_emb/args.py b/libs/infinity_emb/infinity_emb/args.py index fde57081d..0fcb206e7 100644 --- a/libs/infinity_emb/infinity_emb/args.py +++ b/libs/infinity_emb/infinity_emb/args.py @@ -50,11 +50,20 @@ class EngineArgs: pooling_method, PoolingMethod or str: pooling method to use. Defaults to PoolingMethod.auto. lengths_via_tokenize, bool: schedule by token usage. Defaults to False. served_model_name, str: Defaults to readable name of model_name_or_path. + max_query_tokens, Optional[int]: rerank ceiling, head-truncate the query to at most + N tokens. A client may request fewer. Defaults to None (no limit). + max_tokens_per_doc, Optional[int]: rerank ceiling, head-truncate each document to at + most N tokens. A client may request fewer. Defaults to None (no limit). + max_pair_tokens, Optional[int]: rerank ceiling on the joined (query, document) pair. + A client may request fewer. Defaults to None (no limit). 
""" model_name_or_path: str = MANAGER.model_id[0] batch_size: int = MANAGER.batch_size[0] revision: Optional[str] = MANAGER.revision[0] + max_query_tokens: Optional[int] = MANAGER.max_query_tokens[0] + max_tokens_per_doc: Optional[int] = MANAGER.max_tokens_per_doc[0] + max_pair_tokens: Optional[int] = MANAGER.max_pair_tokens[0] trust_remote_code: bool = MANAGER.trust_remote_code[0] engine: InferenceEngine = InferenceEngine[MANAGER.engine[0]] model_warmup: bool = MANAGER.model_warmup[0] @@ -99,6 +108,10 @@ def __post_init__(self): ) if self.revision is not None and self.revision == "": object.__setattr__(self, "revision", None) + for limit_name in ("max_query_tokens", "max_tokens_per_doc", "max_pair_tokens"): + limit = getattr(self, limit_name) + if limit is not None and limit <= 0: + raise ValueError(f"{limit_name} must be a positive integer or None, got {limit}") if isinstance(self.vector_disk_cache_path, bool): object.__setattr__( self, @@ -163,9 +176,12 @@ def from_env(cls) -> list["EngineArgs"]: embedding_dtype=embedding_dtype, served_model_name=served_model_name, onnx_disable_optimize=onnx_disable_optimize, - onnx_do_not_prefer_quantized=onnx_do_not_prefer_quantized + onnx_do_not_prefer_quantized=onnx_do_not_prefer_quantized, + max_query_tokens=max_query_tokens, + max_tokens_per_doc=max_tokens_per_doc, + max_pair_tokens=max_pair_tokens, ) - for model_name_or_path, batch_size, revision, trust_remote_code, engine, model_warmup, device, compile, bettertransformer, dtype, pooling_method, lengths_via_tokenize, embedding_dtype, served_model_name,onnx_disable_optimize,onnx_do_not_prefer_quantized in zip_longest( + for model_name_or_path, batch_size, revision, trust_remote_code, engine, model_warmup, device, compile, bettertransformer, dtype, pooling_method, lengths_via_tokenize, embedding_dtype, served_model_name,onnx_disable_optimize,onnx_do_not_prefer_quantized, max_query_tokens, max_tokens_per_doc, max_pair_tokens in zip_longest( MANAGER.model_id, 
MANAGER.batch_size, MANAGER.revision, @@ -181,6 +197,9 @@ def from_env(cls) -> list["EngineArgs"]: MANAGER.embedding_dtype, MANAGER.served_model_name, MANAGER.onnx_disable_optimize, - MANAGER.onnx_do_not_prefer_quantized + MANAGER.onnx_do_not_prefer_quantized, + MANAGER.max_query_tokens, + MANAGER.max_tokens_per_doc, + MANAGER.max_pair_tokens, ) ] diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py index 58b418103..8ddd9641b 100644 --- a/libs/infinity_emb/infinity_emb/cli.py +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -278,6 +278,18 @@ def v2( **_construct("onnx_do_not_prefer_quantized"), help="Do not use quantized onnx models by default if available", ), + max_query_tokens: list[int] = typer.Option( + **_construct("max_query_tokens"), + help="Rerank ceiling: head-truncate the query to at most N tokens before scoring. A client may request fewer. Unset disables the limit.", + ), + max_tokens_per_doc: list[int] = typer.Option( + **_construct("max_tokens_per_doc"), + help="Rerank ceiling: head-truncate each document to at most N tokens before scoring. A client may request fewer. Unset disables the limit.", + ), + max_pair_tokens: list[int] = typer.Option( + **_construct("max_pair_tokens"), + help="Rerank ceiling on the joined (query, document) pair, in tokens. A client may request fewer. Unset disables the limit.", + ), ): """Infinity API ♾️ cli v2. MIT License. 
Copyright (c) 2023-now Michael Feil \n \n @@ -341,7 +353,10 @@ def v2( bettertransformer=bettertransformer, served_model_name=served_model_name, onnx_disable_optimize=onnx_disable_optimize, - onnx_do_not_prefer_quantized=onnx_do_not_prefer_quantized + onnx_do_not_prefer_quantized=onnx_do_not_prefer_quantized, + max_query_tokens=max_query_tokens, + max_tokens_per_doc=max_tokens_per_doc, + max_pair_tokens=max_pair_tokens, ) engine_args = [] diff --git a/libs/infinity_emb/infinity_emb/engine.py b/libs/infinity_emb/infinity_emb/engine.py index 153e15ba7..2f9a6e726 100644 --- a/libs/infinity_emb/infinity_emb/engine.py +++ b/libs/infinity_emb/infinity_emb/engine.py @@ -22,6 +22,17 @@ ) +def _clamp_to_ceiling(requested: Optional[int], ceiling: Optional[int]) -> Optional[int]: + """Clamp a client-requested rerank token budget to the model's startup ceiling. + + The startup ceiling guards backend stability; a client may only lower a limit (to + trade quality for speed), never raise it. ``None`` means "no constraint from that + side", so the result is the smaller of the two set values, or ``None`` if neither is. + """ + candidates = [value for value in (requested, ceiling) if value is not None] + return min(candidates) if candidates else None + + class AsyncEmbeddingEngine: """ An LLM engine that receives requests and embeds them asynchronously. @@ -164,6 +175,9 @@ async def rerank( docs: list[str], raw_scores: bool = False, top_n: Optional[int] = None, + max_query_tokens: Optional[int] = None, + max_tokens_per_doc: Optional[int] = None, + max_pair_tokens: Optional[int] = None, ) -> tuple[list["RerankReturnType"], int]: """rerank multiple sentences @@ -173,6 +187,16 @@ async def rerank( raw_scores (bool): return raw scores instead of sigmoid top_n (Optional[int]): number of top scores to return after reranking if top_n is None, <= 0 or out of range, all scores are returned + max_query_tokens (Optional[int]): head-truncate the query to at most N tokens. 
+ max_tokens_per_doc (Optional[int]): head-truncate each document to at most N + tokens. + max_pair_tokens (Optional[int]): cap the joined (query, document) pair to at most + N tokens, trimming the longest side first. + + Each token budget is clamped to the model's startup ceiling + (``EngineArgs.max_*``): a client may lower a limit to trade quality for speed but + cannot raise it above the configured ceiling. None on both the request and the + ceiling disables that limit. Raises: ValueError: raised if engine is not started yet @@ -189,6 +213,15 @@ async def rerank( docs=docs, raw_scores=raw_scores, top_n=top_n, + max_query_tokens=_clamp_to_ceiling( + max_query_tokens, self._engine_args.max_query_tokens + ), + max_tokens_per_doc=_clamp_to_ceiling( + max_tokens_per_doc, self._engine_args.max_tokens_per_doc + ), + max_pair_tokens=_clamp_to_ceiling( + max_pair_tokens, self._engine_args.max_pair_tokens + ), ) return scores, usage @@ -351,6 +384,9 @@ async def rerank( docs: list[str], raw_scores: bool = False, top_n: Optional[int] = None, + max_query_tokens: Optional[int] = None, + max_tokens_per_doc: Optional[int] = None, + max_pair_tokens: Optional[int] = None, ) -> tuple[list["RerankReturnType"], int]: """rerank multiple sentences @@ -360,6 +396,12 @@ async def rerank( docs (list[str]): docs to be reranked raw_scores (bool): return raw scores instead of sigmoid top_n (Optional[int]): number of top scores to return after reranking + max_query_tokens (Optional[int]): head-truncate the query to at most N tokens. + max_tokens_per_doc (Optional[int]): head-truncate each document to at most N + tokens. + max_pair_tokens (Optional[int]): cap the joined (query, document) pair to at + most N tokens, trimming the longest side first. Each budget is clamped to + the model's startup ceiling; a client may lower a limit but not raise it. 
Raises: ValueError: raised if engine is not started yet @@ -370,7 +412,15 @@ async def rerank( list[float]: list of scores int: token usage """ - return await self[model].rerank(query=query, docs=docs, raw_scores=raw_scores, top_n=top_n) + return await self[model].rerank( + query=query, + docs=docs, + raw_scores=raw_scores, + top_n=top_n, + max_query_tokens=max_query_tokens, + max_tokens_per_doc=max_tokens_per_doc, + max_pair_tokens=max_pair_tokens, + ) async def classify( self, *, model: str, sentences: list[str], raw_scores: bool = False diff --git a/libs/infinity_emb/infinity_emb/env.py b/libs/infinity_emb/infinity_emb/env.py index 48833e473..be29bdadf 100644 --- a/libs/infinity_emb/infinity_emb/env.py +++ b/libs/infinity_emb/infinity_emb/env.py @@ -6,7 +6,7 @@ import os from functools import cached_property from pathlib import Path -from typing import TypeVar +from typing import Optional, TypeVar from infinity_emb.log_handler import logger from infinity_emb.primitives import ( @@ -87,6 +87,21 @@ def _to_bool_multiple(value: list[str]) -> list[bool]: def _to_int_multiple(value: list[str]) -> list[int]: return [int(v) for v in value] + @staticmethod + def _to_optional_int_multiple(value: list[str]) -> list[Optional[int]]: + """Parse a per-model list where an empty token or `none`/`null` disables the limit.""" + parsed: list[Optional[int]] = [] + for v in value: + v = v.strip() + if v == "" or v.lower() in {"none", "null"}: + parsed.append(None) + continue + number = int(v) + if number <= 0: + raise ValueError(f"token limit must be a positive integer, got `{v}`") + parsed.append(number) + return parsed + @cached_property def api_key(self): return self._optional_infinity_var("api_key", default="") @@ -107,6 +122,24 @@ def batch_size(self): self._optional_infinity_var_multiple("batch_size", default=["32"]) ) + @cached_property + def max_query_tokens(self): + return self._to_optional_int_multiple( + self._optional_infinity_var_multiple("max_query_tokens", 
default=[""]) + ) + + @cached_property + def max_tokens_per_doc(self): + return self._to_optional_int_multiple( + self._optional_infinity_var_multiple("max_tokens_per_doc", default=[""]) + ) + + @cached_property + def max_pair_tokens(self): + return self._to_optional_int_multiple( + self._optional_infinity_var_multiple("max_pair_tokens", default=[""]) + ) + @cached_property def revision(self): return self._optional_infinity_var_multiple("revision", default=[""]) diff --git a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py index 94c3c3d6a..87827316a 100644 --- a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py +++ b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py @@ -13,7 +13,10 @@ from infinity_emb._optional_imports import CHECK_PYDANTIC -from infinity_emb.primitives import EmbeddingEncodingFormat, Modality +from infinity_emb.primitives import ( + EmbeddingEncodingFormat, + Modality, +) CHECK_PYDANTIC.mark_required() # pydantic 2.x is strictly needed starting v0.0.70 @@ -230,6 +233,33 @@ class RerankInput(BaseModel): raw_scores: bool = False model: str = "default/not-specified" top_n: Optional[int] = Field(default=None, gt=0) + max_query_tokens: Optional[int] = Field( + default=None, + gt=0, + description=( + "Head-truncate the query to at most N tokens before scoring. Clamped to the " + "model's server-side ceiling: a request may lower this but not raise it above " + "the configured limit. Omit or null to use the server ceiling." + ), + ) + max_tokens_per_doc: Optional[int] = Field( + default=None, + gt=0, + description=( + "Head-truncate each document to at most N tokens before scoring (Cohere v2 " + "compatible). Clamped to the model's server-side ceiling: a request may lower " + "this but not raise it. Omit or null to use the server ceiling." 
+ ), + ) + max_pair_tokens: Optional[int] = Field( + default=None, + gt=0, + description=( + "Cap the joined (query, document) pair to at most N tokens, trimming the longest " + "side first. Clamped to the model's server-side ceiling: a request may lower " + "this but not raise it. Omit or null to use the server ceiling." + ), + ) class _ReRankObject(BaseModel): diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py index 1cb48aa9c..ae5a9328a 100644 --- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py +++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py @@ -30,6 +30,7 @@ OverloadStatus, PredictSingle, PrioritizedQueueItem, + RerankLimits, RerankReturnType, ReRankSingle, get_inner_item, @@ -180,15 +181,25 @@ async def rerank( docs: list[str], raw_scores: bool = False, top_n: Optional[int] = None, + max_query_tokens: Optional[int] = None, + max_tokens_per_doc: Optional[int] = None, + max_pair_tokens: Optional[int] = None, ) -> tuple[list[RerankReturnType], int]: """Schedule a query to be reranked with documents. Awaits until reranked. + The token budgets here are already resolved (the engine has clamped them to the + model's startup ceilings); ``None`` disables the corresponding limit. + Args: query (str): query for reranking docs (list[str]): documents to be reranked raw_scores (bool): return raw scores instead of sigmoid top_n (Optional[int]): number of top scores to return after reranking if top_n is None, <= 0 or out of range, all scores are returned + max_query_tokens (Optional[int]): head-truncate the query to N tokens. + max_tokens_per_doc (Optional[int]): head-truncate each document to N tokens. + max_pair_tokens (Optional[int]): cap the joined (query, document) pair to N + tokens, trimming the longest side first. 
Raises: ModelNotDeployedError: If loaded model does not expose `rerank` @@ -202,7 +213,12 @@ async def rerank( raise ModelNotDeployedError( "the loaded moded cannot fullyfill `rerank`. " f"Options are {self.capabilities}." ) - rerankables = [ReRankSingle(query=query, document=doc) for doc in docs] + limits = RerankLimits( + max_query_tokens=max_query_tokens, + max_tokens_per_doc=max_tokens_per_doc, + max_pair_tokens=max_pair_tokens, + ) + rerankables = [ReRankSingle(query=query, document=doc, limits=limits) for doc in docs] scores, usage = await self._schedule(rerankables) if not raw_scores: diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py index 382cc6e8e..d491819f5 100644 --- a/libs/infinity_emb/infinity_emb/infinity_server.py +++ b/libs/infinity_emb/infinity_emb/infinity_server.py @@ -423,6 +423,9 @@ async def _rerank(data: RerankInput): docs=data.documents, raw_scores=data.raw_scores, top_n=data.top_n, + max_query_tokens=data.max_query_tokens, + max_tokens_per_doc=data.max_tokens_per_doc, + max_pair_tokens=data.max_pair_tokens, ) duration = (time.perf_counter() - start) * 1000 diff --git a/libs/infinity_emb/infinity_emb/primitives.py b/libs/infinity_emb/infinity_emb/primitives.py index 843b5f048..88d1ada3f 100644 --- a/libs/infinity_emb/infinity_emb/primitives.py +++ b/libs/infinity_emb/infinity_emb/primitives.py @@ -22,6 +22,7 @@ Any, Generic, Literal, + NamedTuple, Optional, Type, TypedDict, @@ -223,16 +224,49 @@ def to_input(self) -> str: return self.sentence +class RerankLimits(NamedTuple): + """Effective per-pair token budgets for one rerank request. + + Applied per (query, document) pair before scoring so the backend does not blow up on + oversized inputs. The per-axis caps (query, doc) are head truncations that protect the + split; ``max_pair_tokens`` is the hard ceiling on the joined sequence length (i.e. on + model cost). ``None`` disables a given limit. 
+ + These are the already-resolved values for a single request: the engine clamps the + client-requested budgets to the model's startup ceilings (``EngineArgs.max_*``) before + building this tuple. There is no global default any more — a sensible ceiling depends + on the model, so it is configured per model at startup. + """ + + max_query_tokens: Optional[int] = None + max_tokens_per_doc: Optional[int] = None + max_pair_tokens: Optional[int] = None + + @dataclass(**dataclass_args) class ReRankSingle(AbstractSingle): query: str document: str + # Effective token budgets for this pair; the query and document are each head-truncated + # and the joined pair is capped. Already resolved against the model's startup ceilings; + # an all-None RerankLimits means no truncation. + limits: RerankLimits = RerankLimits() def str_repr(self) -> str: - return self.query + self.document - - def to_input(self) -> tuple[str, str]: - return self.query, self.document + # The limits change the tokenised pair, so they must be part of the cache key + # (vector_disk_cache keys on str_repr); otherwise a capped result could be served + # for a later uncapped request for the same pair. Keep the no-limit path byte-for-byte + # identical to avoid perturbing length estimation and existing cache entries. 
+ if self.limits == RerankLimits(): + return self.query + self.document + return ( + f"{self.query}{self.document}" + f"|rerank_limits={self.limits.max_query_tokens}," + f"{self.limits.max_tokens_per_doc},{self.limits.max_pair_tokens}" + ) + + def to_input(self) -> tuple[str, str, RerankLimits]: + return self.query, self.document, self.limits @dataclass(**dataclass_args) diff --git a/libs/infinity_emb/infinity_emb/sync_engine.py b/libs/infinity_emb/infinity_emb/sync_engine.py index deda66d93..26556e1c8 100644 --- a/libs/infinity_emb/infinity_emb/sync_engine.py +++ b/libs/infinity_emb/infinity_emb/sync_engine.py @@ -189,6 +189,9 @@ def rerank( docs: list[str], raw_scores: bool = False, top_n: Optional[int] = None, + max_query_tokens: Optional[int] = None, + max_tokens_per_doc: Optional[int] = None, + max_pair_tokens: Optional[int] = None, ): """sync interface of AsyncEngineArray""" return self.async_run( @@ -198,6 +201,9 @@ def rerank( docs=docs, raw_scores=raw_scores, top_n=top_n, + max_query_tokens=max_query_tokens, + max_tokens_per_doc=max_tokens_per_doc, + max_pair_tokens=max_pair_tokens, ) @add_start_docstrings(AsyncEngineArray.classify.__doc__) diff --git a/libs/infinity_emb/infinity_emb/transformer/crossencoder/__init__.py b/libs/infinity_emb/infinity_emb/transformer/crossencoder/__init__.py index e69de29bb..9eb24627c 100644 --- a/libs/infinity_emb/infinity_emb/transformer/crossencoder/__init__.py +++ b/libs/infinity_emb/infinity_emb/transformer/crossencoder/__init__.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2023-now michaelfeil + +from __future__ import annotations + +from typing import Any, Optional, Sequence + +__all__ = ["truncate_texts_to_tokens"] + + +def truncate_texts_to_tokens( + tokenizer: Any, + texts: Sequence[str], + max_tokens: Sequence[Optional[int]], +) -> list[str]: + """Head-truncate each text to its first ``max_tokens[i]`` tokens. 
+ + Used for both the query and each document before the (query, document) pair is built. + A text is returned unchanged when its cap is ``None``/non-positive or it already fits, + so text that does not need shortening never makes a lossy decode round-trip. + + Run this from the single preprocessing thread (i.e. inside ``encode_pre``) with that + model's own ``tokenizer``; the token-counting ``_infinity_tokenizer`` runs on a + different thread and must not be shared. + """ + positive_caps = [n for n in max_tokens if n and n > 0] + if not positive_caps: + return list(texts) + + # Bound the tokenisation cost: a capped text only needs ``cap + 1`` tokens to detect + # that it overflows, and an uncapped text is returned unchanged (its ids are discarded). + # Truncating the batch to the largest cap + 1 avoids fully tokenising oversized inputs, + # which is the OOM the limits exist to prevent. + token_ids = tokenizer( + list(texts), + add_special_tokens=False, + truncation=True, + max_length=max(positive_caps) + 1, + return_attention_mask=False, + return_token_type_ids=False, + )["input_ids"] + + truncated: list[str] = [] + for original, ids, cap in zip(texts, token_ids, max_tokens): + if cap and cap > 0 and len(ids) > cap: + truncated.append(tokenizer.decode(ids[:cap], skip_special_tokens=True)) + else: + truncated.append(original) + return truncated diff --git a/libs/infinity_emb/infinity_emb/transformer/crossencoder/optimum.py b/libs/infinity_emb/infinity_emb/transformer/crossencoder/optimum.py index dfcb155fc..68abc46ec 100644 --- a/libs/infinity_emb/infinity_emb/transformer/crossencoder/optimum.py +++ b/libs/infinity_emb/infinity_emb/transformer/crossencoder/optimum.py @@ -7,7 +7,9 @@ from infinity_emb._optional_imports import CHECK_ONNXRUNTIME from infinity_emb.args import EngineArgs +from infinity_emb.primitives import RerankLimits from infinity_emb.transformer.abstract import BaseCrossEncoder +from infinity_emb.transformer.crossencoder import 
truncate_texts_to_tokens from infinity_emb.transformer.utils_optimum import ( device_to_onnx, get_onnx_files, @@ -58,15 +60,42 @@ def __init__(self, *, engine_args: EngineArgs): ) self._infinity_tokenizer = copy.deepcopy(self.tokenizer) - def encode_pre(self, queries_docs: list[tuple[str, str]]) -> dict[str, np.ndarray]: - encoded = self.tokenizer( - queries_docs, - max_length=self.config.max_position_embeddings, - padding=True, - truncation="longest_first", - return_tensors="np", - return_token_type_ids=False, + def encode_pre( + self, queries_docs: list[tuple[str, str, RerankLimits]] + ) -> dict[str, np.ndarray]: + queries = [t[0] for t in queries_docs] + documents = [t[1] for t in queries_docs] + limits = [t[2] if len(t) > 2 else RerankLimits() for t in queries_docs] + + model_max = ( + getattr(self.config, "max_position_embeddings", None) + or self.tokenizer.model_max_length + ) + + def pair_max_length(limit: RerankLimits) -> int: + if (limit.max_pair_tokens or 0) > 0: + return min(limit.max_pair_tokens, model_max) + return model_max + + # 1) head-truncate the query and the document independently, then + # 2) cap the joined pair (longest side trimmed first) to max_pair_tokens. 
+ queries = truncate_texts_to_tokens( + self.tokenizer, queries, [lim.max_query_tokens for lim in limits] + ) + documents = truncate_texts_to_tokens( + self.tokenizer, documents, [lim.max_tokens_per_doc for lim in limits] ) + encodings = [ + self.tokenizer( + q, + d, + max_length=pair_max_length(lim), + truncation="longest_first", + return_token_type_ids=False, + ) + for q, d, lim in zip(queries, documents, limits) + ] + encoded = self.tokenizer.pad(encodings, padding=True, return_tensors="np") # Windows requires int64 encoded = {k: v.astype(np.int64) for k, v in encoded.items()} return encoded diff --git a/libs/infinity_emb/infinity_emb/transformer/crossencoder/torch.py b/libs/infinity_emb/infinity_emb/transformer/crossencoder/torch.py index ae6d65750..46cb7b022 100644 --- a/libs/infinity_emb/infinity_emb/transformer/crossencoder/torch.py +++ b/libs/infinity_emb/infinity_emb/transformer/crossencoder/torch.py @@ -9,8 +9,9 @@ from infinity_emb._optional_imports import CHECK_SENTENCE_TRANSFORMERS, CHECK_TORCH from infinity_emb.args import EngineArgs from infinity_emb.log_handler import logger -from infinity_emb.primitives import Device +from infinity_emb.primitives import Device, RerankLimits from infinity_emb.transformer.abstract import BaseCrossEncoder +from infinity_emb.transformer.crossencoder import truncate_texts_to_tokens from infinity_emb.transformer.quantization.interface import ( quant_interface, ) @@ -88,14 +89,43 @@ def __init__(self, *, engine_args: EngineArgs): logger.info("using torch.compile(dynamic=True)") self.model = torch.compile(self.model, dynamic=True) - def encode_pre(self, input_tuples: list[tuple[str, str]]): + def encode_pre(self, input_tuples: list[tuple[str, str, RerankLimits]]): # return input_tuples - texts = [[t[0].strip(), t[1].strip()] for t in input_tuples] + queries = [t[0].strip() for t in input_tuples] + documents = [t[1].strip() for t in input_tuples] + limits = [t[2] if len(t) > 2 else RerankLimits() for t in input_tuples] - 
tokenized = self.tokenizer( - texts, padding=True, truncation="longest_first", return_tensors="pt" + model_max = ( + getattr(self.model.config, "max_position_embeddings", None) + or self.tokenizer.model_max_length ) - return tokenized + + def pair_max_length(limit: RerankLimits) -> int: + # Always cap at the model's positional limit: max_pair_tokens may exceed it + # (e.g. a 768-token request against a 512-position model), which would produce + # a sequence the model cannot process in encode_core. + if (limit.max_pair_tokens or 0) > 0: + return min(limit.max_pair_tokens, model_max) + return model_max + + # 1) head-truncate the query and the document independently, then + # 2) cap the joined pair (longest side trimmed first) to max_pair_tokens. + queries = truncate_texts_to_tokens( + self.tokenizer, queries, [lim.max_query_tokens for lim in limits] + ) + documents = truncate_texts_to_tokens( + self.tokenizer, documents, [lim.max_tokens_per_doc for lim in limits] + ) + encodings = [ + self.tokenizer( + q, + d, + truncation="longest_first", + max_length=pair_max_length(lim), + ) + for q, d, lim in zip(queries, documents, limits) + ] + return self.tokenizer.pad(encodings, padding=True, return_tensors="pt") def encode_core(self, features: dict[str, "Tensor"]): """ diff --git a/libs/infinity_emb/tests/unit_test/test_args.py b/libs/infinity_emb/tests/unit_test/test_args.py index ec08cecc5..86ea7a628 100644 --- a/libs/infinity_emb/tests/unit_test/test_args.py +++ b/libs/infinity_emb/tests/unit_test/test_args.py @@ -1,9 +1,48 @@ +import pytest + from infinity_emb.args import EngineArgs +from infinity_emb.engine import _clamp_to_ceiling from infinity_emb.primitives import Device, InferenceEngine def test_EngineArgs_no_input(): - EngineArgs() + args = EngineArgs() + # rerank token ceilings are unset by default (no limit) + assert args.max_query_tokens is None + assert args.max_tokens_per_doc is None + assert args.max_pair_tokens is None + + +def 
test_engine_args_rerank_limits(): + args = EngineArgs( + model_name_or_path="michaelfeil/bge-small-en-v1.5", + device="cpu", + max_query_tokens=64, + max_tokens_per_doc=256, + max_pair_tokens=320, + ) + assert args.max_query_tokens == 64 + assert args.max_tokens_per_doc == 256 + assert args.max_pair_tokens == 320 + + +def test_engine_args_rejects_non_positive_limit(): + with pytest.raises(ValueError): + EngineArgs(model_name_or_path="michaelfeil/bge-small-en-v1.5", max_query_tokens=0) + + +@pytest.mark.parametrize( + "requested, ceiling, expected", + [ + (None, None, None), # no limit at all + (None, 256, 256), # client unset -> use the server ceiling + (128, None, 128), # no ceiling -> use the request as-is + (128, 256, 128), # client lowers below the ceiling + (512, 256, 256), # client cannot exceed the ceiling -> clamped down + ], +) +def test_clamp_to_ceiling(requested, ceiling, expected): + assert _clamp_to_ceiling(requested, ceiling) == expected def test_engine_args(): diff --git a/libs/infinity_emb/tests/unit_test/test_primitives.py b/libs/infinity_emb/tests/unit_test/test_primitives.py new file mode 100644 index 000000000..74d0d23ea --- /dev/null +++ b/libs/infinity_emb/tests/unit_test/test_primitives.py @@ -0,0 +1,20 @@ +from infinity_emb.primitives import RerankLimits, ReRankSingle + + +def test_rerank_single_str_repr_default_is_query_plus_document(): + # the no-limit path must stay byte-for-byte identical to query + document so existing + # cache entries and length estimates are unaffected. + single = ReRankSingle(query="where is paris", document="paris is in france") + assert single.str_repr() == "where is parisparis is in france" + + +def test_rerank_single_str_repr_differs_per_limits(): + # the cache keys on str_repr, so two requests for the same pair but different limits + # must produce different keys (otherwise a capped result is served for an uncapped one). 
+ pair = dict(query="q", document="d") + default = ReRankSingle(**pair) + capped = ReRankSingle(**pair, limits=RerankLimits(max_pair_tokens=32)) + capped_more = ReRankSingle(**pair, limits=RerankLimits(max_pair_tokens=64)) + + assert capped.str_repr() != default.str_repr() + assert capped.str_repr() != capped_more.str_repr() diff --git a/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py b/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py index cf190ebc1..edaf02da6 100644 --- a/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py +++ b/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py @@ -42,6 +42,46 @@ def test_crossencoder(): assert rankings[0] > rankings[1] > rankings[2] +def test_crossencoder_rerank_limits(): + from infinity_emb.primitives import RerankLimits + + model = CrossEncoderPatched( + engine_args=EngineArgs( + model_name_or_path="mixedbread-ai/mxbai-rerank-xsmall-v1", + compile=SHOULD_TORCH_COMPILE, + device=device, + ) + ) + + query = "Where is Paris? " * 100 + long_doc = "Paris is the capital of France. " * 200 + + model_max = ( + getattr(model.model.config, "max_position_embeddings", None) + or model.tokenizer.model_max_length + ) + + no_limits = RerankLimits(None, None, None) + uncapped = model.encode_pre([(query, long_doc, no_limits)]) + doc_capped = model.encode_pre([(query, long_doc, RerankLimits(None, 32, None))]) + pair_capped = model.encode_pre([(query, long_doc, RerankLimits(None, None, 48))]) + all_capped = model.encode_pre([(query, long_doc, RerankLimits(16, 32, 40))]) + over_model_max = model.encode_pre([(query, long_doc, RerankLimits(None, None, model_max * 10))]) + legacy_pair = model.encode_pre([(query, long_doc)]) # 2-tuple -> RerankLimits() (no caps) + + # capping the document shortens the scored sequence vs no limits at all. 
+ assert doc_capped["input_ids"].shape[1] < uncapped["input_ids"].shape[1] + # the pair cap is a hard ceiling on the whole joined sequence (nothing is added on top of it). + assert pair_capped["input_ids"].shape[1] <= 48 + assert all_capped["input_ids"].shape[1] <= 40 + # no limit and an over-large pair cap are both clamped to the model's positional limit, + # so neither ever exceeds what the model can process in encode_core. + assert uncapped["input_ids"].shape[1] <= model_max + assert over_model_max["input_ids"].shape[1] <= model_max + # a 2-tuple still works (RerankLimits() sets no per-request caps; the pair is still cut to the model limit). + assert legacy_pair["input_ids"].shape[1] > 0 + + def test_patched_crossencoder_vs_sentence_transformers(): model = CrossEncoderPatched( engine_args=EngineArgs(