cherry-studio/packages/catalog/data/models.json
suyao 2593a427e0
refactor: update schemas and models for consistency and clarity
- Changed `perMillionTokens` to `per_million_tokens` in PricePerTokenSchema for snake_case consistency.
- Removed unused types from index.ts and simplified ProviderModelOverrideSchema by removing deprecated fields.
- Enhanced ModelConfigSchema to enforce unique capabilities and modalities, and made context_window and max_output_tokens optional.
- Updated ProviderConfigSchema to require at least one supported endpoint.
- Removed commented-out code and unused imports in route.ts for cleaner code.
- Added a cleanup script to remove deprecated fields from overrides.json.
- Implemented a new importer for AIHubMix models, transforming API data into the internal format.
- Created a utility for applying and validating model overrides, ensuring better error handling and warnings.
- Updated various scripts for better organization and clarity, including removing search models and generating AIHubMix models.
2025-12-08 22:47:16 +08:00
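
For reference, a minimal sketch of the renamed pricing shape as it now appears in the entries below (assuming Zod, which the `*Schema` naming suggests; the actual definitions live elsewhere in the catalog package and may differ):

```ts
import { z } from "zod";

// Hypothetical reconstruction of PricePerTokenSchema after the snake_case rename.
const PricePerTokenSchema = z.object({
  per_million_tokens: z.number().nonnegative(), // was `perMillionTokens`
  currency: z.string().default("USD")
});

// Pricing tiers observed in this file; the cache fields are optional
// because many entries omit them.
const PricingSchema = z.object({
  input: PricePerTokenSchema,
  output: PricePerTokenSchema,
  cache_read: PricePerTokenSchema.optional(),
  cache_write: PricePerTokenSchema.optional()
});
```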

{
"version": "2025.12.08",
"models": [
{
"id": "claude-opus-4-5",
"description": "Claude Opus 4.5 is Anthropics latest frontier reasoning model, optimized for complex engineering, agentic workflows, and long-horizon computer use. It features strong multimodal capabilities, improved resistance to prompt injection, and a new Verbosity parameter to control token efficiency. With advanced tool use, extended context, and multi-agent support, Opus 4.5 excels in autonomous research, debugging, planning, and spreadsheet/browser operations.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 5,
"currency": "USD"
},
"output": {
"per_million_tokens": 25,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"cache_write": {
"per_million_tokens": 6.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gemini-3-pro-image-preview",
"description": "Gemini-3-Pro-Image-Preview (Nano Banana Pro) is a high-performance image generation and editing model built on Gemini 3 Pro. It delivers enhanced multimodal understanding and real-world semantic reasoning, enabling fast creation of well-structured visual content such as infographics, product sketches, and multi-subject scenes. It can also leverage real-time knowledge through Search grounding. The model excels in text rendering, consistent multi-image blending, and identity preservation, while offering fine-grained creative controls like localized edits, lighting and focus adjustments, camera transformations, and flexible aspect ratios. Its ideal for rapid design, concept previews, product visualization, and everyday image generation workflows.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 12,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "gemini-3-pro-preview",
"description": "google state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT",
"WEB_SEARCH"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 12,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs",
"web",
"deepsearch",
"long_context"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs,web,deepsearch,long_context"
}
},
{
"id": "gpt-5.1",
"description": "GPT-5 is OpenAIs most advanced language model, designed for complex tasks that require step-by-step reasoning, precise instruction following, and high reliability. It improves reasoning, code generation, and prompt understanding—including test-time routing and intent cues like “think hard about this”—while reducing hallucination and sycophancy.",
"capabilities": [
"REASONING",
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.125,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"web",
"tools",
"deepsearch",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,web,tools,deepsearch,function_calling,structured_outputs"
}
},
{
"id": "gpt-5.1-codex-max",
"description": "GPT-5.1-Codex-Max is a frontier programming model built for the agent-driven era. Powered by an upgraded core reasoning architecture, it is specially trained for complex agentic tasks in software engineering, mathematics, and scientific research. It delivers faster performance, greater stability, and higher token efficiency across the entire development lifecycle, including code generation, refactoring, debugging, and engineering collaboration. With native support for multiple context windows and a built-in compaction mechanism, the model can coherently process millions of tokens within a single task.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT",
"REASONING"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.125,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"function_calling",
"structured_outputs",
"thinking"
],
"original_types": "llm",
"original_features": "function_calling,structured_outputs,thinking"
}
},
{
"id": "gemini-3-pro-preview-search",
"description": "Gemini-3-pro-search integrates Google's official search functionality; the search feature incurs an additional separate fee log directly incorporated into the scoring, but the log details are not displayed; this will be fixed in the future to show the details; it only supports OpenAI-compatible format calls and does not support the Gemini SDK; for the Gemini native SDK, please directly set the official search parameters.",
"capabilities": [
"REASONING",
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 12,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"web",
"deepsearch",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "thinking,web,deepsearch,tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "gpt-5.1-chat-latest",
"description": "GPT-5.1 Chat refers to the GPT-5.1 snapshot currently used in ChatGPT and is optimized for conversational use cases. While GPT-5.1 is recommended for most API applications, GPT-5.1 Chat is ideal for testing the latest improvements in chat-based interactions.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 128000,
"max_output_tokens": 16384,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.125,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "function_calling,structured_outputs"
}
},
{
"id": "gpt-5.1-codex",
"description": "GPT-5.1-Codex is a version of GPT-5 optimized for agentic coding tasks in Codex or similar environments. It's available in the Responses API only and the underlying model snapshot will be regularly updated. ",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.125,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,function_calling,structured_outputs"
}
},
{
"id": "gpt-5.1-codex-mini",
"description": "GPT-5.1 Codex mini is a smaller, more cost-effective, less-capable version of GPT-5.1-Codex.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 0.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.025,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,function_calling,structured_outputs"
}
},
{
"id": "mistral-large-3",
"description": "Mistral Large 3 is a MoE model with 67.5B total parameters and 41B active parameters, supporting a 256K-token context window. Trained from scratch on 3,000 NVIDIA H200 GPUs, it is one of the strongest permissively licensed open-weight models available.\n\nDesigned for advanced reasoning and long-context understanding, Mistral Large 3 delivers performance on par with the best instruction-tuned open-weight models for general-purpose tasks, while also offering image understanding capabilities. Its multilingual strengths are particularly notable for non-English/Chinese languages, making it well-suited for global applications.\n\nTypical use cases include enterprise assistants, multilingual customer support, content generation and editing, data analysis over long documents, code assistance, and research workflows that require handling large corpora or complex instructions. With its MoE architecture, Mistral Large 3 balances strong performance with efficient inference, providing a versatile backbone for building reliable, production-grade AI systems.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 256000,
"max_output_tokens": 256000,
"pricing": {
"input": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "function_calling,structured_outputs"
}
},
{
"id": "claude-haiku-4-5",
"description": "Claude Haiku 4.5 is a fast, affordable, and highly capable AI model, excelling at coding and agentic tasks. Its combination of speed and low cost makes it ideal for powering real-time applications like chatbots, high-volume free services, and specialized \"sub-agents\" for complex tasks in coding, finance, and research. It can also handle common business tasks like creating office documents and assisting with strategy and analysis.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 204800,
"max_output_tokens": 131072,
"pricing": {
"input": {
"per_million_tokens": 1.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 5.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"cache_write": {
"per_million_tokens": 1.375,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "claude-sonnet-4-5",
"description": "Sonnet 4.5 is the best model in the world for agents, coding, and computer usage. It is also our most accurate and detailed model for long-running tasks, with enhanced knowledge in coding, finance, and cybersecurity. \nThis model supports a thinking parameter to enable thinking requests in Claude mode.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 1000000,
"max_output_tokens": 64000,
"pricing": {
"input": {
"per_million_tokens": 3.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 16.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.33,
"currency": "USD"
},
"cache_write": {
"per_million_tokens": 4.125,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gemini-2.5-flash-image",
"description": "Gemini 2.5 Flash Image (Nano-Banana) is a state-of-the-art image generation and editing model that enables seamless blending of multiple images into a single composition while maintaining character consistency for rich visual storytelling. It supports precise, targeted image transformations through natural language instructions and leverages built-in world knowledge for both image generation and editing, making it well suited for creative design, content production, advertising, and visual expression workflows.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"VISION",
"TEXT"
],
"output_modalities": [
"VISION"
],
"context_window": 32800,
"max_output_tokens": 8000,
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.499,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "glm-4.6",
"description": "GLM-4.6 is Zhipus latest flagship model (total parameters 355B, activation parameters 32B), comprehensively surpassing GLM-4.5. Its coding capability is aligned with Claude Sonnet 4, making it a top domestic coding model; the context window has been expanded from 128K to 200K, better suited for long code and agent tasks; inference capabilities have been significantly enhanced and support tool invocation during processing; improvements have been made in tool calling, search agents, writing style, role play, and multilingual translation. The model is named glm-4.6 and is provided by three vendors, with calls prioritized to the Sophnet platform.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 204800,
"max_output_tokens": 131072,
"pricing": {
"input": {
"per_million_tokens": 0,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "grok-4-1-fast-non-reasoning",
"description": "Grok 4.1 is a new conversational model with significant improvements in real-world usability, delivering exceptional performance in creative, emotional, and collaborative interactions. It is more perceptive to nuanced user intent, more engaging to converse with, and more coherent in personality, while fully preserving its core intelligence and reliability. Built on large-scale reinforcement learning infrastructure, the model is optimized for style, personality, helpfulness, and alignment, and leverages frontier agentic reasoning models as reward evaluators to autonomously assess and iterate on responses at scale, significantly enhancing overall interaction quality.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 2000000,
"max_output_tokens": 2000000,
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "grok-4-1-fast-reasoning",
"description": "Grok 4.1 is a new conversational model with significant improvements in real-world usability, delivering exceptional performance in creative, emotional, and collaborative interactions. It is more perceptive to nuanced user intent, more engaging to converse with, and more coherent in personality, while fully preserving its core intelligence and reliability. Built on large-scale reinforcement learning infrastructure, the model is optimized for style, personality, helpfulness, and alignment, and leverages frontier agentic reasoning models as reward evaluators to autonomously assess and iterate on responses at scale, significantly enhancing overall interaction quality.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 2000000,
"max_output_tokens": 2000000,
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gpt-5",
"description": "GPT-5 is OpenAIs most advanced general-purpose model, delivering major improvements in reasoning, code quality, and overall user experience. It is optimized for complex tasks that require step-by-step reasoning, precise instruction following, and high accuracy in high-stakes scenarios. The model supports test-time routing and advanced prompt understanding, including user-specified intent such as “think hard about this,” while significantly reducing hallucination and sycophancy and improving performance in coding, writing, and health-related tasks.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.125,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "deepseek-v3.2",
"description": "DeepSeek-V3.2 is an efficient large language model equipped with DeepSeek Sparse Attention and reinforced reasoning performance, but its core strength lies in powerful agentic capabilities—enabled by large-scale task-synthesis that tightly integrates reasoning with real-world tool use, delivering robust, compliant, and generalizable agent behaviour. Users can toggle deeper reasoning through the reasoning_enabled switch.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"max_output_tokens": 64000,
"pricing": {
"input": {
"per_million_tokens": 0.302,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.453,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0302,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "deepseek-v3.2-speciale",
"description": "DeepSeek-V3.2-Speciale is an enhanced long-thinking variant of DeepSeek-V3.2 that integrates the theorem-proving capabilities of DeepSeek-Math-V2. It excels in instruction following, mathematical reasoning, and logical verification, achieving performance comparable to Gemini-3.0-Pro on major reasoning benchmarks and winning gold medals at IMO 2025, CMO 2025, ICPC World Finals 2025, and IOI 2025. However, due to its long-thinking mechanism, the model may overthink simple questions, so task complexity should be carefully controlled during usage. The model only supports the thinking version.",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 0.302,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.453,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0302,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking"
],
"original_types": "llm",
"original_features": "thinking"
}
},
{
"id": "deepseek-v3.2-think",
"description": "DeepSeek-V3.2 is an efficient large language model equipped with DeepSeek Sparse Attention and reinforced reasoning performance, but its core strength lies in powerful agentic capabilities—enabled by large-scale task-synthesis that tightly integrates reasoning with real-world tool use, delivering robust, compliant, and generalizable agent behaviour. Users can toggle deeper reasoning through the reasoning_enabled switch.",
"capabilities": [
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"max_output_tokens": 64000,
"pricing": {
"input": {
"per_million_tokens": 0.302,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.453,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0302,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"web",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "web,tools,function_calling,structured_outputs"
}
},
{
"id": "deepseek-math-v2",
"description": "The mathematical reasoning of large language models has shifted from pursuing correct answers to ensuring rigorous processes. Research proposes a new paradigm of \"self-verification,\" training specialized verifiers to evaluate proof steps and using this to train generators for self-error correction. The two co-evolve, pushing the boundaries of capability. Ultimately, the model achieves gold medal level in top competitions like the IMO, demonstrating the great potential of deep reasoning.",
"capabilities": [
"REASONING",
"WEB_SEARCH"
],
"input_modalities": [
"TEXT"
],
"context_window": 163000,
"max_output_tokens": 163000,
"pricing": {
"input": {
"per_million_tokens": 0.492,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.968,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0984,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"web"
],
"original_types": "llm",
"original_features": "thinking,web"
}
},
{
"id": "DeepSeek-V3.2-Exp",
"description": "The model DeepSeek-V3.2-Exp is officially named deepseek-chat on the website. It is an experimental version. As an intermediate step towards the next-generation architecture, V3.2-Exp introduces DeepSeek Sparse Attention (a sparse attention mechanism) based on V3.1-Terminus, exploring and validating",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 163000,
"max_output_tokens": 163000,
"pricing": {
"input": {
"per_million_tokens": 0.274,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.411,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0274,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "DeepSeek-V3.2-Exp-Think",
"description": "The model DeepSeek-V3.2-Exp-Think is officially named deepseek-reasoner. It is an experimental version. As an intermediate step towards the next-generation architecture, V3.2-Exp introduces DeepSeek Sparse Attention (a sparse attention mechanism) based on V3.1-Terminus, exploring and validating exploratory optimizations for training and inference efficiency on long texts.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 131000,
"max_output_tokens": 64000,
"pricing": {
"input": {
"per_million_tokens": 0.274,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.411,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0274,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gpt-5-codex",
"description": "GPT-5-Codex is a version of GPT-5 optimized for autonomous coding tasks in Codex or similar environments. It is only available in the Responses API, and the underlying model snapshots will be updated regularly. https://docs.aihubmix.com/en/api/Responses-API You can also use it in codex-cll; see https://docs.aihubmix.com/en/api/Codex-CLI for using codex-cll through Aihubmix.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.125,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "DeepSeek-V3.1-Terminus",
"description": "DeepSeek-V3.1 non-thinking mode has now been updated to the DeepSeek-V3.1-Terminus version.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 160000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.56,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.68,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "DeepSeek-V3.1-Think",
"description": "Thinking mode of DeepSeek-V3.1; \nDeepSeek V3.1 is a text generation model provided by DeepSeek, featuring a hybrid reasoning architecture that achieves an effective integration of thinking and non-thinking modes.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.56,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.68,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gpt-5-pro",
"description": "GPT-5 pro uses more compute to think harder and provide consistently better answers.\n\nGPT-5 pro is available in the Responses API only to enable support for multi-turn model interactions before responding to API requests, and other advanced API features in the future. Since GPT-5 pro is designed to tackle tough problems, some requests may take several minutes to finish. To avoid timeouts, try using background mode. As our most advanced reasoning model, GPT-5 pro defaults to (and only supports) reasoning.effort: high. GPT-5 pro does not support code interpreter.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 15,
"currency": "USD"
},
"output": {
"per_million_tokens": 120,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gpt-5-mini",
"description": "GPT-5 mini is a faster, more cost-efficient version of GPT-5. It's great for well-defined tasks and precise prompts.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 0.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.025,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gpt-5-nano",
"description": "GPT-5-Nano is the smallest and fastest variant in the GPT-5 system, designed specifically for developer tools and environments that demand rapid interactions and ultra-low latency. While it offers a more lightweight solution with limited reasoning depth compared to its larger counterparts, GPT-5-Nano excels in core capabilities such as instruction-following and maintaining critical safety features. As the successor to GPT-4.1-nano, it provides an optimal choice for cost-sensitive or real-time applications, where efficiency and speed are paramount. Particularly well-suited for summarization and classification tasks, GPT-5-Nano is a powerful tool for developers needing a swift, reliable AI model for streamlined processes.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.005,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gpt-5-chat-latest",
"description": "GPT-5 Chat points to the GPT-5 snapshot currently used in ChatGPT. GPT-5 is our next-generation, high-intelligence flagship model. It accepts both text and image inputs, and produces text outputs.",
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 400000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.125,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "claude-opus-4-1",
"description": "Opus 4.1 is an upgraded version of Claude Opus 4, with improvements mainly in agent tasks, practical coding, and reasoning. Compared to Opus 4, there is a slight improvement in software engineering accuracy; Opus 4.1 has higher accuracy at 74.5%.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 16.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 82.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "doubao-seedream-4-5",
"description": "Seedream 4.5 is ByteDance's latest multimodal image model, integrating capabilities such as text-to-image, image-to-image, and multi-image output, along with incorporating common sense and reasoning abilities. Compared to the previous 4.0 model, it significantly improves generation quality, offering better editing consistency and multi-image fusion effects, with more precise control over image details. The generation of small text and small faces is more natural.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "sora-2",
"description": "Sora-2 is the next-generation text-to-video model evolved from Sora, optimized for higher visual realism, stronger physical consistency, and longer temporal coherence. It delivers more stable character consistency, complex motion rendering, camera control, and narrative continuity, while supporting higher resolutions and minute-level video generation for film production, advertising, virtual content creation, and creative multimedia workflows.",
"capabilities": [
"VIDEO_GENERATION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "sora-2-pro",
"description": "OpenAI video model Sora2-pro official API.",
"capabilities": [
"VIDEO_GENERATION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "gpt-4o-audio-preview",
"description": "OpenAI voice input and output model, with prices consistent with the official ones. For now, only the text portion prices are displayed; voice prices can be found on the official OpenAI website. Backend billing is the same as the official.",
"input_modalities": [
"TEXT",
"AUDIO"
],
"context_window": 128000,
"max_output_tokens": 16384,
"pricing": {
"input": {
"per_million_tokens": 2.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "gpt-4o-mini-audio-preview",
"input_modalities": [
"TEXT",
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 0.15,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "o3",
"description": "OpenAI o3 is a powerful model across multiple domains, setting a new standard for coding, math, science, and visual reasoning tasks.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 100000,
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 8,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gemini-2.5-pro",
"description": "Gemini 2.5 Pro is an advanced reasoning model developed by Google, optimized for solving highly complex problems across multiple domains. It can deeply understand large-scale information from diverse sources, including text, audio, images, video, and even entire codebases. The model demonstrates strong reasoning capabilities in coding, mathematics, and STEM-related tasks, and supports long-context analysis for large datasets, codebases, and technical documentation.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT",
"WEB_SEARCH",
"REASONING"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.31,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs",
"long_context",
"web",
"thinking",
"deepsearch"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs,long_context,web,thinking,deepsearch"
}
},
{
"id": "jimeng-3.0-1080p",
"description": "DreamVideo 3.0 Pro is a professional-grade text-to-video and image-to-video model built on the Dream framework, delivering a major breakthrough in video generation quality. This version demonstrates strong performance across multiple dimensions, including narrative coherence, instruction following, dynamic fluidity, and visual detail. It supports multi-shot storytelling and generates 1080P high-definition videos with a professional cinematic texture. The model also enables diverse and expressive stylistic rendering, making it well suited for creative production and visual storytelling.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "jimeng-3.0-720p",
"description": "DreamVideo 3.0 Pro is a professional-grade text-to-video and image-to-video model built on the Dream framework, delivering a major breakthrough in video generation quality. This version demonstrates strong performance across multiple dimensions, including narrative coherence, instruction following, dynamic fluidity, and visual detail. It supports multi-shot storytelling and generates 1080P high-definition videos with a professional cinematic texture. The model also enables diverse and expressive stylistic rendering, making it well suited for creative production and visual storytelling.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "jimeng-3.0-pro",
"description": "DreamVideo 3.0 Pro is a professional-grade text-to-video and image-to-video model built on the Dream framework, delivering a major breakthrough in video generation quality. This version demonstrates strong performance across multiple dimensions, including narrative coherence, instruction following, dynamic fluidity, and visual detail. It supports multi-shot storytelling and generates 1080P high-definition videos with a professional cinematic texture. The model also enables diverse and expressive stylistic rendering, making it well suited for creative production and visual storytelling.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "kimi-for-coding-free",
"description": "kimi-for-coding-free is a free and open version offered by AIHubMix specifically for Kimi users. To maintain stable service operations, the following usage limits apply: a maximum of 10 requests per minute, 1,000 total requests per day, and a daily quota of 5 million tokens.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 256000,
"max_output_tokens": 256000,
"pricing": {
"input": {
"per_million_tokens": 0,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "o3-pro",
"description": "o3-pro\nThis model only supports Requests API interface requests.The model's thinking time is relatively long, so the response will be slow.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 100000,
"pricing": {
"input": {
"per_million_tokens": 20,
"currency": "USD"
},
"output": {
"per_million_tokens": 80,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 20,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "wan2.2-i2v-plus",
"description": "The newly upgraded Tongyi Wanxiang 2.2 text-to-video offers higher video quality. It optimizes video generation stability and success rate, features stronger instruction-following capabilities, consistently maintains image text, portrait, and product consistency, and provides precise camera motion control.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "wan2.2-t2v-plus",
"description": "The newly upgraded Tongyi Wanxiang 2.2 text-to-video offers higher video quality. It can stably generate large-scale complex motions, supports cinematic-level visual performance and control, and features enhanced instruction-following capabilities to achieve realistic physical world reproduction.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "wan2.5-i2v-preview",
"description": "Tongyi Wanxiang 2.5 - Text-to-Video Preview features a newly upgraded technical architecture, supporting sound generation synchronized with visuals, 10-second long video generation, stronger instruction-following capabilities, and further improvements in motion ability and visual quality.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "wan2.5-t2v-preview",
"description": "Tongyi Wanxiang 2.5 - Text-to-Video Preview, newly upgraded model architecture, supports sound generation synchronized with visuals, supports 10-second long video generation, enhanced instruction compliance, improved motion capability, and further enhanced visual quality.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "web-sora-2",
"description": "This model is an unofficial reverse-engineered API of the OpenAI web version sora-2-hd, for entertainment purposes only. Charges apply regardless of generation success or failure, billed per use. Please avoid using it if you mind. It can be used via the chat interface, allowing intuitive image uploads: you can directly upload images through the chat interface as the basis for video generation.\n\nPrecise parameter control: by appending commands such as \"landscape/portrait,\" \"16:9/9:16,\" \"10 seconds/15 seconds,\" etc., at the end of the prompt, you can directly define the video's aspect ratio and duration.",
"capabilities": [
"VIDEO_GENERATION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "web-sora-2-pro",
"description": "This model is an unofficial reverse-engineered API of the OpenAI web version sora-2-hd, for entertainment purposes only. Charges apply regardless of generation success or failure, billed per use. Please avoid using it if you mind. It can be used via the chat interface, allowing intuitive image uploads: you can directly upload images through the chat interface as the basis for video generation.\n\nPrecise parameter control: by appending commands such as \"landscape/portrait,\" \"16:9/9:16,\" \"10 seconds/15 seconds,\" etc., at the end of the prompt, you can directly define the video's aspect ratio and duration.",
"capabilities": [
"VIDEO_GENERATION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "cc-glm-4.6",
"description": "for claude code",
"pricing": {
"input": {
"per_million_tokens": 0.06,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.22,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "coding-glm-4.6",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.06,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.22,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.010998,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "coding-glm-4.6-free",
"description": "coding-glm-4.6-free is the open and free version of coding-glm-4.6. To ensure stable service performance, usage limits are in place: up to 10 requests per minute, 1,000 requests per day, and a daily token allowance of 5 million.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 200000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 0,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "coding-minimax-m2",
"description": "coding-minimax-m2 is a free and open version offered by AIHubMix specifically for MiniMax users. To maintain stable service operations, the following usage limits apply: a maximum of 10 requests per minute, 1,000 total requests per day, and a daily quota of 5 million tokens.204800",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 204800,
"max_output_tokens": 13100,
"pricing": {
"input": {
"per_million_tokens": 0,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "coding-minimax-m2-free",
"description": "coding-minimax-m2-free is a free and open version offered by AIHubMix specifically for MiniMax users. To maintain stable service operations, the following usage limits apply: a maximum of 10 requests per minute, 1,000 total requests per day, and a daily quota of 5 million tokens.204800",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 204800,
"max_output_tokens": 13100,
"pricing": {
"input": {
"per_million_tokens": 0,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "doubao-seed-code-free",
"description": "doubao-seed-code-free is a free and open version offered by AIHubMix specifically for Doubao users. To maintain stable service operations, the following usage limits apply: a maximum of 10 requests per minute, 1,000 total requests per day, and a daily quota of 5 million tokens.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 256000,
"max_output_tokens": 256000,
"pricing": {
"input": {
"per_million_tokens": 0,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "flux-2-flex",
"description": "FLUX.2 is purpose-built for real-world creative production workflows. It delivers high-quality images while maintaining character and style consistency across multiple reference images, shows exceptional understanding and execution of structured prompts, and supports complex text reading and writing. It also adheres to brand guidelines, handles lighting, layout, and logo elements with stability, and enables image editing at resolutions up to 4MP — all while preserving fine details, striking a balance between creativity and professional-grade visual output.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "flux-2-pro",
"description": "FLUX.2 is purpose-built for real-world creative production workflows. It delivers high-quality images while maintaining character and style consistency across multiple reference images, shows exceptional understanding and execution of structured prompts, and supports complex text reading and writing. It also adheres to brand guidelines, handles lighting, layout, and logo elements with stability, and enables image editing at resolutions up to 4MP — all while preserving fine details, striking a balance between creativity and professional-grade visual output.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "gemini-2.5-pro-search",
"description": "gemini-2.5-pro-search integrates Google's official search functionality; the search feature will have an additional separate fee log directly incorporated into the scoring, with detailed logs not displayed; this will be fixed and displayed later; only supports OpenAI-compatible formats for invocation, does not support Gemini SDK; for Gemini's native SDK, please set parameters directly using the official search parameters.",
"capabilities": [
"REASONING",
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.31,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"thinking",
"web",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm,search",
"original_features": "thinking,web,tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "kimi-k2-thinking",
"description": "Kimi K2 Thinking is Moonshot AI's most advanced open-source inference model to date, extending the K2 series into intelligent agent and long-context inference domains. The model is built on the trillion-parameter mixture of experts (MoE) architecture introduced by Kimi K2, activating 32 billion parameters per forward pass and supporting a context window of 256,000 tokens.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 262144,
"max_output_tokens": 262144,
"pricing": {
"input": {
"per_million_tokens": 0.548,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.192,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.137,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "Kimi-K2-0905",
"description": "Kimi-K2-0905 is a large-scale Mixture of Experts (MoE) language model developed by Moonshot AI, with a total of 1 trillion parameters and 32 billion active parameters per forward pass. It supports long-context inference of up to 256k tokens, an expansion from the previous 128k.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 262144,
"max_output_tokens": 262144,
"pricing": {
"input": {
"per_million_tokens": 0.548,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.192,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "DeepSeek-V3.1-Fast",
"description": "The model provider is the Sophon platform. DeepSeek V3.1 Fast is the high-TPS speed version of DeepSeek V3.1.\nHybrid thinking mode: By modifying the chat template, a single model can simultaneously support both thinking and non-thinking modes.\nSmarter tool usage: Through post-training optimization, the models performance in tool utilization and agent tasks has improved significantly.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 163000,
"max_output_tokens": 163000,
"pricing": {
"input": {
"per_million_tokens": 1.096,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.288,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "claude-sonnet-4-0",
"description": "Claude Sonnet 4 is a significant upgrade to Sonnet 3.7, delivering superior performance in coding and reasoning with enhanced precision and control. Achieving a state-of-the-art 72.7% on SWE-bench, the model expertly balances advanced capability with computational efficiency. Key improvements include more reliable codebase navigation and complex instruction following, making it ideal for a wide range of applications, from routine coding to complex software development projects.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 1000000,
"max_output_tokens": 64000,
"pricing": {
"input": {
"per_million_tokens": 3.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 16.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.33,
"currency": "USD"
},
"cache_write": {
"per_million_tokens": 4.125,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gemini-2.5-flash",
"description": "Gemini 2.5 Flash is Googles best model in terms of both performance and cost efficiency, offering a comprehensive set of capabilities. It is the first Flash model to support visible reasoning, allowing insight into the thought process behind its responses. With its strong priceperformance ratio, the model is well suited for large-scale processing, low-latency, high-throughput tasks that require reasoning, as well as agent-based application scenarios.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.499,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "gemini-2.5-flash-preview-09-2025",
"description": "This latest 2.5 Flash model comes with improvements in two key areas we heard consistent feedback on:\n\nBetter agentic tool use: We've improved how the model uses tools, leading to better performance in more complex, agentic and multi-step applications. This model shows noticeable improvements on key agentic benchmarks, including a 5% gain on SWE-Bench Verified, compared to our last release (48.9% → 54%). More efficient: With thinking on, the model is now significantly more cost-efficient—achieving higher quality outputs while using fewer tokens, reducing latency and cost (see charts above).",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.499,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "glm-4.5v",
"description": "GLM-4.5V is a vision-language foundational model designed for multimodal agent applications. Based on a mixture-of-experts (MoE) architecture, it has 106 billion parameters and 12 billion active parameters. It delivers outstanding performance in video understanding, image question answering, OCR, and document parsing, and achieves significant improvements in front-end web encoding, basic reasoning, and spatial reasoning.",
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 64000,
"max_output_tokens": 16384,
"pricing": {
"input": {
"per_million_tokens": 0.274,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.822,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.274,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.5-flash-lite",
"description": "Gemini 2.5 Flash-Lite is a balanced model from Google, optimized for applications that require low-latency performance. It retains the practical capabilities of the Gemini 2.5 family, including configurable reasoning based on budget, integration with tools such as grounding via Google Search and code execution, multimodal input support, and an ultra-long context window of up to 1 million tokens, delivering a strong balance between efficiency, functionality, and cost.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.025,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "gemini-2.5-flash-lite-preview-09-2025",
"description": "gemini-2.5-flash-lite latest preview version",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.025,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "kimi-for-coding",
"description": "imi-for-coding is a free and open version offered by AIHubMix specifically for Kimi users. To maintain stable service operations, the following usage limits apply: a maximum of 10 requests per minute, 1,000 total requests per day, and a daily quota of 5 million tokens.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 256000,
"max_output_tokens": 256000,
"pricing": {
"input": {
"per_million_tokens": 0,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gemini-2.5-flash-nothink",
"description": "Gemini-2.5-flash defaults to thinking enabled; to disable thinking, request the name gemini-2.5-flash-nothink, which only supports OpenAI-compatible format calls and does not support Gemini SDK; for the native Gemini SDK, please set the parameter budget=0 directly.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1047576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.499,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "gemini-2.5-flash-search",
"description": "gemini-2.5-flash-search integrates Google's official search functionality; the search feature will have an additional separate fee log directly incorporated into the scoring, with detailed logs not displayed; this will be fixed and displayed later; only supports OpenAI-compatible formats for invocation, does not support Gemini SDK; for Gemini's native SDK, please set parameters directly using the official search parameters.",
"capabilities": [
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.499,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"web",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm,search",
"original_features": "web,tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "gemini-2.5-flash-preview-05-20-nothink",
"description": "Gemini-2.5-flash-preview-05-20 is enabled by default for thinking; to disable it, request the name gemini-2.5-flash-preview-05-20-nothink.Only OpenAI-compatible format calls are supported; Gemini SDK is not supported. For the native Gemini SDK, please set the parameter budget=0 directly.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.499,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "gemini-2.5-flash-preview-05-20-search",
"description": "Gemini-2.5 Flash Preview 05-20 Search integrates Google's official search functionality; the search feature will have an additional separate fee log directly integrated into the scoring deduction, with detailed logs not displayed. It will be fixed and displayed later. Only OpenAI-compatible formats are supported for invocation; Gemini SDK is not supported. For Gemini's native SDK, please set parameters directly using the official search parameters.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.499,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm,search",
"original_features": "tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "DeepSeek-V3-Fast",
"description": "V3 Ultra-Fast Version,The current price is a limited-time 50% discount and will return to the original price on July 31st. The original price is: input: $0.55/M, output: $2.2/M. The model provider is the Sophnet platform. DeepSeek V3 Fast is a high-TPS, ultra-fast version of DeepSeek V3 0324, featuring full-precision (non-quantized) performance, enhanced code and math capabilities, and faster responses!\n\nDeepSeek V3 0324 is a powerful Mixture-of-Experts (MoE) model with a total parameter count of 671B, activating 37B parameters per token.\nIt adopts Multi-Head Latent Attention (MLA) and the DeepSeekMoE architecture to achieve efficient inference and economical training costs.\nIt innovatively implements a load balancing strategy without auxiliary loss and sets multi-token prediction training targets to enhance performance.\nThe model is pre-trained on 14.8 trillion diverse, high-quality tokens and further optimized through supervised fine-tuning and reinforcement learning stages to fully realize its capabilities.\nComprehensive evaluations show that DeepSeek V3 outperforms other open-source models and rivals leading closed-source models in performance.\nThe entire training process only requires 2.788M H800 GPU hours and remains highly stable, with no irrecoverable loss spikes or rollbacks.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 32000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.56,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.24,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "veo-2.0-generate-001",
"description": "Veo 2.0 is an advanced video generation model capable of producing high-quality videos based on text or image prompts. It excels in understanding real-world physics and human motion, resulting in fluid character movements and lifelike scenes. Veo 2.0 supports various visual styles and camera control options, including lens types, angles, and motion effects. Users can generate 8-second video clips at 720p resolution.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"VIDEO"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "veo3.1",
"description": "veo3.1 reverse model, and other available model names that can be requested include: veo3.1-pro and veo3.1-components. The price is currently tentatively set to be calculated per token, approximately $0.05 per request.",
"capabilities": [
"VIDEO_GENERATION"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 200,
"currency": "USD"
},
"output": {
"per_million_tokens": 200,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "imagen-4.0",
"description": "Imagen 4 is a high-quality text-to-image model developed by Google, designed for strong visual fidelity, diverse artistic styles, and precise controllability. It delivers near photographic realism with sharp details and natural lighting while significantly reducing common artifacts such as distorted hands. The model supports a wide range of styles including photorealistic, illustration, anime, oil painting, and pixel art, and offers flexible aspect ratios for use cases from content covers to mobile wallpapers. It also enables image editing and secondary creation on existing images, provides fast and stable generation, and offers strong commercial usability with high visual quality and reliable content safety.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "imagen-4.0-fast-generate-001",
"description": "Imagen 4 is a new-generation image generation model designed to balance high-quality output, inference efficiency, and content safety. It supports image generation, digital watermarking with authenticity verification, user-configurable safety settings, and prompt enhancement via the Prompt Rewriter, while also delivering reliable person generation capabilities. The model ID is imagen-4.0-generate-001, making it suitable for professional creation, design workflows, and various generative AI applications.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "imagen-4.0-fast-generate-preview-06-06",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "imagen-4.0-generate-001",
"description": "Imagen 4 is a new-generation image generation model designed to balance high-quality output, inference efficiency, and content safety. It supports image generation, digital watermarking with authenticity verification, user-configurable safety settings, and prompt enhancement via the Prompt Rewriter, while also delivering reliable person generation capabilities. The model ID is imagen-4.0-generate-001, making it suitable for professional creation, design workflows, and various generative AI applications.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "imagen-4.0-ultra-generate-001",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "imagen-4.0-ultra",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "gpt-image-1",
"description": "Azure OpenAIs gpt-image-1 image generation API offers both text-to-image generation and image-to-image editing with text guidance capabilities.\nBefore using this API, please ensure you have the latest OpenAI package installed by running pip install -U openai.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 5,
"currency": "USD"
},
"output": {
"per_million_tokens": 40,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "gpt-image-1-mini",
"description": "OpenAI image generation model gpt-image-1-mini\nBefore use, please run pip install -U openai to upgrade to the latest openai package.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 5,
"currency": "USD"
},
"output": {
"per_million_tokens": 40,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "o4-mini",
"description": "o4-mini is a remarkably smart model for its speed and cost-efficiency. This allows it to support significantly higher usage limits than o3, making it a strong high-volume, high-throughput option for everyone with questions that benefit from reasoning.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 100000,
"pricing": {
"input": {
"per_million_tokens": 1.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 4.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.275,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tool",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tool,function_calling,structured_outputs"
}
},
{
"id": "gemini-2.5-flash-image-preview",
"description": "Aihubmix supports the gemini-2.5-flash-image-preview model; you can add extra parameters modalities=[\"text\", \"image\"] through the OpenAI-compatible chat interface; https://docs.aihubmix.com/en/api/Gemini-Guides#gemini-2-5-flash%3A-quick-task-support",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"VISION",
"TEXT"
],
"output_modalities": [
"VISION"
],
"context_window": 32800,
"max_output_tokens": 8000,
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "glm-4.5",
"description": "GLM-4.5",
"input_modalities": [
"TEXT"
],
"context_window": 131072,
"max_output_tokens": 98304,
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4.1",
"description": "The latest flagship multimodal model supports million-token context, with encoding capability (SWE-bench 54.6%) and instruction-following (Scale AI 38.3%) performance significantly surpassing GPT-4o, while reducing costs by 26%, making it suitable for complex tasks. Its automatic caching mechanism offers a 75% cost reduction on cache hits.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 1047576,
"max_output_tokens": 32768,
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 8,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "grok-4",
"description": "Grok, their latest and greatest flagship model, offers unparalleled performance in natural language, math, and reasoning the perfect jack of all trades.\nThe current pointing model version is grok-4-0709.",
"context_window": 256000,
"max_output_tokens": 64000,
"pricing": {
"input": {
"per_million_tokens": 3.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 16.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.825,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "grok-4-fast-non-reasoning",
"description": "Grok-4-fast is a cost-effective inference model developed by xAI that delivers cutting-edge performance with excellent token efficiency. The model features a 2 million token context window, advanced Web and X search capabilities, and a unified architecture supporting both \"inference\" and \"non-inference\" modes. Compared to Grok 4, it reduces thinking tokens by an average of 40% and lowers the price by 98% while achieving the same performance.",
"context_window": 2000000,
"max_output_tokens": 30000,
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "grok-4-fast-reasoning",
"description": "Grok-4-fast is a cost-effective inference model developed by xAI that delivers cutting-edge performance with excellent token efficiency. The model features a 2 million token context window, advanced Web and X search capabilities, and a unified architecture supporting both \"inference\" and \"non-inference\" modes. Compared to Grok 4, it reduces thinking tokens by an average of 40% and lowers the price by 98% while achieving the same performance.",
"context_window": 2000000,
"max_output_tokens": 30000,
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "kimi-k2-0711",
"description": "Kimi-K2 is a MoE architecture foundational model with extremely powerful coding and agent capabilities, featuring a total of 1 trillion parameters and activating 32 billion parameters. In benchmark performance tests across major categories such as general knowledge reasoning, programming, mathematics, and agents, the K2 model outperforms other mainstream open-source models.\nThe Kimi-K2 model supports a context length of 128k tokens.\nIt does not support visual capabilities.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 131000,
"max_output_tokens": 131000,
"pricing": {
"input": {
"per_million_tokens": 0.54,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.16,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "kimi-k2-turbo-preview",
"description": "The kimi-k2-turbo-preview model is a high-speed version of kimi-k2, with the same model parameters as kimi-k2, but the output speed has been increased from 10 tokens per second to 40 tokens per second.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 262144,
"max_output_tokens": 262144,
"pricing": {
"input": {
"per_million_tokens": 1.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 4.8,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-vl-235b-a22b-instruct",
"description": "The Qwen3 series open-source models include hybrid models, thinking models, and non-thinking models, with both reasoning capabilities and general abilities reaching industry SOTA levels at the same scale.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 131000,
"max_output_tokens": 33000,
"pricing": {
"input": {
"per_million_tokens": 0.274,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.096,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-vl-235b-a22b-thinking",
"description": "The Qwen3 series open-source models include hybrid models, thinking models, and non-thinking models, with both reasoning capabilities and general abilities reaching industry SOTA levels at the same scale.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 131000,
"max_output_tokens": 33000,
"pricing": {
"input": {
"per_million_tokens": 0.274,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.74,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-vl-30b-a3b-instruct",
"description": "The Qwen3-VL series second-largest MoE model Instruct version offers fast response speed and supports ultra-long contexts such as long videos and long documents; it features comprehensive upgrades in image/video understanding, spatial perception, and universal recognition abilities; it also provides visual 2DD/3D localization capabilities, making it capable of handling complex real-world tasks.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 128000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.1028,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4112,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-vl-30b-a3b-thinking",
"description": "The Qwen3-VL series second-largest MoE model Thinking version offers fast response speed, stronger multimodal understanding and reasoning, visual agent capabilities, and ultra-long context support for long videos and long documents; it features comprehensive upgrades in image/video understanding, spatial perception, and universal recognition abilities, making it capable of handling complex real-world tasks.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 128000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.1028,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.028,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "veo-3.0-generate-preview",
"description": "Veo 3.0 Generate Preview is an advanced AI video generation model that supports text-to-video creation with synchronized audio, featuring excellent physical simulation and lip-sync capabilities. Users can generate vivid video clips from short story prompts. 🎟️ Limited-Time Deal: Save 10% Now.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "veo-3.1-fast-generate-preview",
"description": "Veo 3.1 is Google's state-of-the-art model for generating high-fidelity, 8-second 720p or 1080p videos featuring stunning realism and natively generated audio.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "veo-3.1-generate-preview",
"description": "Veo 3.1 is Google's state-of-the-art model for generating high-fidelity, 8-second 720p or 1080p videos featuring stunning realism and natively generated audio.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "DeepSeek-OCR",
"description": "DeepSeek-OCR is a vision-language model launched by DeepSeek AI, focusing on optical character recognition (OCR) and “contextual optical compression.” The model is designed to explore the limits of compressing contextual information from images, efficiently processing documents and converting them into structured text formats such as Markdown. The model requires an image as input.",
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 8000,
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "flux-kontext-max",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-router",
"description": "New model routing capability; request aihubmix-router to automatically route models based on question complexity, so everyone no longer needs to manually switch models; in our tests comparing the use of the model router versus only using GPT-4.1, we observed up to 60% cost savings while maintaining similar accuracy. \nThe context length of the model router depends on the base model used for each prompt. Input size is 200,000, output size is 32,768. \nCurrently, there are four routing models: gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o4-mini. \nPricing: Due to our current billing structure system, requests through aihubmix-router are billed at the price of gpt-4.1-mini regardless of which final model is used; future billing will be based on the actual model invoked. \nEveryone is welcome to try it out; the interface will return the name of the actual called model.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "gpt-4.1-mini",
"description": "Lightweight, high-performance model with million-token context and near-flagship-level encoding and image understanding capabilities, while reducing costs by 83%. It is suitable for rapid development and small to medium-sized applications. The automatic caching mechanism provides a 75% cost reduction on cache hits.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 1047576,
"max_output_tokens": 32768,
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "qwen3-vl-plus",
"description": "The Qwen3 series visual understanding model achieves an effective fusion of thinking and non-thinking modes. Its visual agent capabilities reach world-class levels on public test sets such as OS World. This version features comprehensive upgrades in visual coding, spatial perception, and multimodal reasoning; visual perception and recognition abilities are greatly enhanced, supporting ultra-long video understanding.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 256000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.137,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.37,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0274,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "gpt-4.1-nano",
"description": "Ultra-lightweight model with million-token context, optimized for speed and low latency, costing only $0.10 per million input tokens. It is suitable for edge computing and real-time interaction. The automatic caching mechanism offers a 75% cost reduction on cache hits.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 1047576,
"max_output_tokens": 32768,
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.025,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "gemini-2.5-pro-preview-05-06",
"description": "gemini-2.5-pro latest model",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.31,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"long_context"
],
"original_types": "llm",
"original_features": "thinking,long_context"
}
},
{
"id": "gemini-2.5-pro-preview-03-25",
"description": "Supports high concurrency. \nThe Gemini 2.5 Pro preview version is here, with higher limits for production testing. \nGoogle's latest and most powerful model;",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.31,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "gemini-2.5-pro-preview-05-06-search",
"description": "Integrated with Google's official search function.",
"capabilities": [
"REASONING",
"WEB_SEARCH"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.31,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"thinking",
"web"
],
"original_types": "llm,search",
"original_features": "thinking,web"
}
},
{
"id": "gemini-2.5-pro-preview-03-25-search",
"description": "Integrated with Google's official search function.",
"capabilities": [
"REASONING",
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.31,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"thinking",
"web",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm,search",
"original_features": "thinking,web,tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "qwen3-max-preview",
"description": "Qwen3-Max-Preview is the latest preview model in the Qwen3 series. This version is functionally equivalent to Qwen3-Max-Thinking — simply set extra_body={\"enable_thinking\": True} to enable the thinking mode. Compared to the Qwen2.5 series, it delivers significant improvements in overall general capabilities, including EnglishChinese text understanding, complex instruction following, open-ended reasoning, multilingual processing, and tool-use proficiency. The model also exhibits fewer hallucinations and stronger overall reliability.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.822,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.288,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.822,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-max",
"description": "The Tongyi Qianwen 3 series Max model has undergone special upgrades in intelligent agent programming and tool invocation compared to the preview version. The officially released model this time reaches SOTA level in the field and is adapted to more complex intelligent agent scenarios.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 262144,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.822,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.288,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.822,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-next-80b-a3b-instruct",
"description": "Qwen3-Next-80B-A3B-Instruct is an instruction-tuned model in the Qwen3-Next series, optimized for delivering fast, stable, and direct final answers without showing its reasoning steps (\"thinking traces\").\n\nUnlike chain-of-thought models, it focuses on generating consistent, instruction-following outputs, making it ideal for production environments. It excels at complex tasks like reasoning and coding while maintaining high throughput and stability, especially with ultra-long inputs and multi-turn dialogues.\n\nEngineered for efficiency, its performance rivals larger Qwen3 systems, making it perfectly suited for RAG, tool use, and agentic workflows where deterministic results are critical.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 256000,
"max_output_tokens": 256000,
"pricing": {
"input": {
"per_million_tokens": 0.138,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.552,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-next-80b-a3b-thinking",
"description": "Qwen3-Next-80B-A3B-Thinking is a reasoning-first chat model in the Qwen3-Next line that excels by outputting structured 'thinking' traces (Chain-of-Thought) by default.\n\nDesigned for hard, multi-step problems, it is ideal for tasks like math proofs, code synthesis, logic puzzles, and agentic planning. Compared to other Qwen3 variants, it offers greater stability during long reasoning chains and is tuned to follow complex instructions without getting repetitive or off-task.\n\nThis model is perfectly suited for agent frameworks, tool use (function calling), and benchmarks where a step-by-step breakdown is required. It leverages throughput-oriented techniques for fast generation of detailed, procedural outputs.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 256000,
"max_output_tokens": 256000,
"pricing": {
"input": {
"per_million_tokens": 0.138,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.38,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-235b-a22b-instruct-2507",
"description": "Qwen3-235B-A22B-Instruct-2507",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 262144,
"max_output_tokens": 262144,
"pricing": {
"input": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.12,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-235b-a22b-thinking-2507",
"description": "The open-source thinking model based on Qwen3 has significantly improved in logical ability, general capability, knowledge enhancement, and creative ability compared to the previous version (Tongyi Qianwen 3-235B-A22B). It is suitable for high-difficulty and strong reasoning scenarios.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 262144,
"max_output_tokens": 262144,
"pricing": {
"input": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-coder-30b-a3b-instruct",
"description": "The code generation model based on Qwen3 has powerful Coding Agent capabilities, achieving state-of-the-art performance compared to open-source models.The model adopts tiered pricing.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 2000000,
"max_output_tokens": 262000,
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-coder-480b-a35b-instruct",
"description": "The code generation model based on Qwen3 has powerful Coding Agent capabilities, achieving state-of-the-art performance compared to open-source models.The model adopts tiered pricing.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 262000,
"max_output_tokens": 262000,
"pricing": {
"input": {
"per_million_tokens": 0.82,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.28,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.82,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-235b-a22b",
"description": "Qwen3-235B-A22B is a massive 235B parameter Mixture-of-Experts (MoE) model that operates with the efficiency of a 22B model. Its standout feature is the ability to seamlessly switch between a \"thinking\" mode for complex reasoning and a \"non-thinking\" mode for fast conversation, offering both world-class power and practical speed.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 131100,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.12,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-coder-flash",
"description": "Qwen3 Coder Flash is Alibaba's fast and cost efficient version of their proprietary Qwen3 Coder Plus. It is a powerful coding agent model specializing in autonomous programming via tool calling and environment interaction, combining coding proficiency with versatile general-purpose abilities.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 256000,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.136,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.544,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.136,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-coder-plus",
"description": "The code generation model based on Qwen3 has powerful Coding Agent capabilities, excels in tool invocation and environment interaction, and can achieve autonomous programming with outstanding coding abilities while also possessing general capabilities.The model adopts tiered pricing.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.54,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.16,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.108,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-coder-plus-2025-07-22",
"description": "The code generation model based on Qwen3 has powerful Coding Agent capabilities, excels in tool invocation and environment interaction, and can achieve autonomous programming with outstanding coding abilities while also possessing general capabilities.The model adopts tiered pricing.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 0.54,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.16,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.54,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "DeepSeek-V3",
"description": "It has been automatically upgraded to the latest released version, 250324.\nAutomatically upgraded to the latest released version 250324.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 1638000,
"max_output_tokens": 1638000,
"pricing": {
"input": {
"per_million_tokens": 0.272,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.088,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "LongCat-Flash-Chat",
"description": "Meituan has officially released and open-sourced LongCat-Flash-Chat, which utilizes an innovative Mixture of Experts (MoE) and \"zero-computation expert\" mechanism to achieve a total of 560B parameters, while only activating around 27B parameters per token as needed. At the same time, end-to-end optimization for agents (including a self-built evaluation set and multi-agent trajectory data) significantly enhances its performance in tool usage and complex task orchestration.",
"pricing": {
"input": {
"per_million_tokens": 0.14,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.7,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.5-pro-preview-06-05-search",
"description": "Integrated with Google's official search function.",
"capabilities": [
"REASONING",
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.31,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"thinking",
"web",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm,search",
"original_features": "thinking,web,tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "imagen-4.0-ultra-generate-exp-05-20",
"description": "Image 4.0 Beta version, for testing purposes only. For production environment, it is recommended to use imagen-4.0-generate-preview-05-20.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "Qwen2.5-VL-72B-Instruct",
"description": "The model provider is the Sophon platform. Qwen2.5-VL-72B-Instruct is the latest vision-language model released by the Qwen team. This model excels not only at recognizing common objects such as flowers, birds, fish, and insects, but also at efficiently analyzing text, charts, icons, graphics, and layouts within images. As a visual agent, it is capable of reasoning and dynamically guiding tool usage, supporting both computer and mobile operations. Moreover, it can understand videos longer than one hour and accurately locate relevant video segments.",
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 0.62,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.62,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "ernie-5.0-thinking-preview",
"description": "The new generation Wenxin model, Wenxin 5.0, is a native full-modal large model that adopts native full-modal unified modeling technology, jointly modeling text, images, audio, and video, possessing comprehensive full-modal capabilities. Wenxin 5.0's basic abilities are comprehensively upgraded, performing excellently on benchmark test sets, especially in multimodal understanding, instruction compliance, creative writing, factual accuracy, intelligent agent planning, and tool application.",
"capabilities": [
"REASONING",
"STRUCTURED_OUTPUT",
"FUNCTION_CALL"
],
"input_modalities": [
"TEXT"
],
"context_window": 183000,
"max_output_tokens": 64000,
"pricing": {
"input": {
"per_million_tokens": 0.822,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.288,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"structured_outputs",
"function_calling"
],
"original_types": "llm",
"original_features": "thinking,structured_outputs,function_calling"
}
},
{
"id": "inclusionAI/Ling-1T",
"description": "Ling-1T is the first flagship non-thinking model in the “Ling 2.0” series, featuring 1 trillion total parameters and approximately 50 billion active parameters per token. Built on the Ling 2.0 architecture, Ling-1T is designed to push the limits of efficient inference and scalable cognition. Ling-1T-base was pretrained on over 20 trillion high-quality, reasoning-intensive tokens, supports up to a 128K context length, and incorporates an Evolutionary Chain of Thought (Evo-CoT) process during mid-stage and post-stage training. This training regimen greatly enhances the models efficiency and depth of reasoning, enabling Ling-1T to achieve top performance across multiple complex reasoning benchmarks, balancing accuracy and efficiency.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.548,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.192,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "inclusionAI/Ring-1T",
"description": "Ring-1T is an open-source idea model with a trillion parameters released by the Bailing team. It is based on the Ling 2.0 architecture and the Ling-1T-base foundational model for training, with a total parameter count of 1 trillion, an active parameter count of 50 billion, and supports up to a 128K context window. The model is trained via large-scale verifiable reward reinforcement learning (RLVR), combined with the self-developed Icepop reinforcement learning stabilization method and the efficient ASystem reinforcement learning system, significantly improving the models deep reasoning and natural language reasoning capabilities. Ring-1T achieves leading performance among open-source models on high-difficulty reasoning benchmarks such as mathematics competitions (e.g., IMO 2025), code generation (e.g., ICPC World Finals 2025), and logical reasoning.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.548,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.192,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "glm-4.5-x",
"description": "GLM-4.5-X is the high-speed version of GLM-4.5, offering powerful performance with a generation speed of up to 100 tokens per second.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 2.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 8.91,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.44,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gme-qwen2-vl-2b-instruct",
"description": "The GME-Qwen2VL series is a unified multimodal Embedding model trained based on the Qwen2-VL multimodal large language model (MLLMs). The GME model supports three types of inputs: text, images, and image-text pairs. All these input types can generate universal vector representations and exhibit excellent retrieval performance.",
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 0.138,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.138,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "gte-rerank-v2",
"description": "gte-rerank-v2 is a multilingual unified text ranking model developed by Tongyi Lab, covering multiple major languages worldwide and providing high-quality text ranking services. It is typically used in scenarios such as semantic retrieval and RAG, and can simply and effectively improve text retrieval performance. Given a query and a set of candidate texts (documents), the model ranks the candidates from highest to lowest based on their semantic relevance to the query.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.11,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"rerank"
],
"original_types": "rerank"
}
},
{
"id": "inclusionAI/Ling-flash-2.0",
"description": "Ling-flash-2.0 is a language model from inclusionAI with a total of 100 billion parameters, of which 6.1 billion are activated per token (4.8 billion non-embedding). As part of the Ling 2.0 architecture series, it is designed as a lightweight yet powerful Mixture-of-Experts (MoE) model. It aims to deliver performance comparable to or even exceeding that of 40B-level dense models and other larger MoE models, but with a significantly smaller active parameter count. The model represents a strategy focused on achieving high performance and efficiency through extreme architectural design and training methods.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.136,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.544,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "inclusionAI/Ling-mini-2.0",
"description": "Ling-mini-2.0 is a small-sized, high-performance large language model based on the MoE architecture. It has a total of 16 billion parameters, but only activates 1.4 billion parameters per token (non-embedding 789 million), achieving extremely high generation speed. Thanks to the efficient MoE design and large-scale high-quality training data, despite activating only 1.4 billion parameters, Ling-mini-2.0 still demonstrates top-tier performance on downstream tasks comparable to dense LLMs under 10 billion parameters and even larger-scale MoE models.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.272,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "inclusionAI/Ring-flash-2.0",
"description": "Ring-flash-2.0 is a high-performance thinking model deeply optimized based on the Ling-flash-2.0-base. It uses a mixture-of-experts (MoE) architecture with a total of 100 billion parameters, but only activates 6.1 billion parameters per inference. The model employs the original Icepop algorithm to solve the instability issues of large MoE models during reinforcement learning (RL) training, enabling its complex reasoning capabilities to continuously improve over long training cycles. Ring-flash-2.0 has achieved significant breakthroughs on multiple high-difficulty benchmarks, including mathematics competitions, code generation, and logical reasoning. Its performance not only surpasses top dense models under 40 billion parameters but also rivals larger open-source MoE models and closed-source high-performance thinking models. Although the model focuses on complex reasoning, it also performs exceptionally well on creative writing tasks. Furthermore, thanks to its efficient architecture, Ring-flash-2.0 delivers high performance with low-latency inference, significantly reducing deployment costs in high-concurrency scenarios.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.136,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.544,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "irag-1.0",
"description": "Baidu's self-developed ERNIE iRAG (ERNIE image-based RAG), a retrieval-augmented text-to-image technology, combines Baidu Search's hundreds of millions of image resources with powerful foundational model capabilities to generate various ultra-realistic images. The overall effect far surpasses native text-to-image systems, eliminating the typical AI feel while maintaining low costs. ERNIE iRAG features no hallucinations, ultra-realism, and instant usability.",
"capabilities": [
"IMAGE_GENERATION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "jina-deepsearch-v1",
"description": "DeepSearch combines search, reading, and reasoning capabilities to pursue the best possible answer. It's fully compatible with OpenAI's Chat API format—just replace api.openai.com with aihubmix.com to get started. \nThe stream will return the thinking process.",
"capabilities": [
"REASONING",
"WEB_SEARCH"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 1000000,
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"thinking",
"web",
"deepsearch"
],
"original_types": "llm,search",
"original_features": "thinking,web,deepsearch"
}
},
{
"id": "jina-embeddings-v4",
"description": "A general-purpose vector model with 3.8 billion parameters, used for multimodal and multilingual retrieval, supporting both unidirectional and multi-vector embedding outputs.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "jina-reranker-v3",
"description": "Multimodal multilingual document reranker, 131K context, 0.6B parameters, for visual document sorting.",
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 131000,
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"rerank"
],
"original_types": "rerank"
}
},
{
"id": "llama-4-maverick",
"description": "Llama 4 Maverick is a high-capacity Mixture-of-Experts (MoE) model from Meta, featuring 400B total parameters and 128 experts, while activating an efficient 17B parameters per inference. Engineered for peak performance, it excels at advanced multimodal tasks.\n\nMaverick natively supports text and image input, producing multilingual text and code. With a 1-million-token context window and instruction tuning, it is optimized for complex image reasoning and general-purpose assistant-like interactions.\n\nReleased under the Llama 4 Community License, Maverick is ideal for research and commercial applications demanding state-of-the-art multimodal understanding and high throughput.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 1048576,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "llama-4-scout",
"description": "Llama 4 Scout is a highly efficient Mixture-of-Experts (MoE) model from Meta, activating 17B out of 109B total parameters per inference. It natively supports multimodal input (text and image) and multilingual output (text and code) across 12 languages.\n\nDesigned for assistant-style interaction and visual reasoning, Scout features a massive 10-million-token context window. It is instruction-tuned for tasks like multilingual chat and image understanding and is released under the Llama 4 Community License for local or commercial deployment.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 131000,
"max_output_tokens": 131000,
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "qwen-image",
"description": "Qwen-Image is a foundational image generation model in the Qwen series, achieving significant progress in complex text rendering and precise image editing. Experiments show that the model has strong general capabilities in image generation and editing, especially excelling in Chinese text rendering.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "qwen-image-edit",
"description": "Qwen-Image-Edit is the image editing version of Qwen-Image. Based on the 20B Qwen-Image model, Qwen-Image-Edit successfully extends Qwen-Image's unique text rendering capabilities to image editing tasks, achieving precise text editing. Additionally, Qwen-Image-Edit can input the same image into Qwen2.5-VL (for visual semantic control) and the VAE encoder (for visual appearance control), enabling both semantic and appearance editing functionalities.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "qwen-image-plus",
"description": "Qwen-Image is a foundational image generation model in the Qwen series, achieving significant progress in complex text rendering and precise image editing. Experiments show that the model has strong general capabilities in image generation and editing, especially excelling in Chinese text rendering.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "qwen-mt-plus",
"description": "Based on the comprehensive upgrade of Qwen3, this flagship translation large model supports bidirectional translation across 92 languages. It offers fully enhanced model performance and translation quality, along with more stable terminology customization, format fidelity, and domain-prompting capabilities, making translations more accurate and natural.",
"input_modalities": [
"TEXT"
],
"context_window": 16000,
"max_output_tokens": 8000,
"pricing": {
"input": {
"per_million_tokens": 0.492,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.476,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "qwen-mt-turbo",
"description": "Based on the comprehensive upgrade of Qwen3, this flagship translation large model supports bidirectional translation across 92 languages. It offers fully enhanced model performance and translation quality, along with more stable terminology customization, format fidelity, and domain-prompting capabilities, making translations more accurate and natural.",
"input_modalities": [
"TEXT"
],
"context_window": 16000,
"max_output_tokens": 8000,
"pricing": {
"input": {
"per_million_tokens": 0.192,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.534912,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "qwen3-embedding-0.6b",
"description": "The Qwen3 Embedding model series is the latest proprietary model family from Qwen, specifically designed for text embedding and ranking tasks. Based on the dense base models of the Qwen3 series, it offers comprehensive text embedding and reranking models in various sizes (0.6B, 4B, and 8B). This series inherits the excellent multilingual capabilities, long-text understanding, and reasoning skills of its base models. The Qwen3 Embedding series demonstrates significant advancements in various text embedding and ranking tasks, including text retrieval, code retrieval, text classification, text clustering, and bilingual text mining.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.068,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "qwen3-embedding-4b",
"description": "The Qwen3 Embedding model series is the latest proprietary model family from Qwen, specifically designed for text embedding and ranking tasks. Based on the dense base models of the Qwen3 series, it offers comprehensive text embedding and reranking models in various sizes (0.6B, 4B, and 8B). This series inherits the excellent multilingual capabilities, long-text understanding, and reasoning skills of its base models. The Qwen3 Embedding series demonstrates significant advancements in various text embedding and ranking tasks, including text retrieval, code retrieval, text classification, text clustering, and bilingual text mining.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.068,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "qwen3-embedding-8b",
"description": "The Qwen3 Embedding model series is the latest proprietary model family from Qwen, specifically designed for text embedding and ranking tasks. Based on the dense base models of the Qwen3 series, it offers comprehensive text embedding and reranking models in various sizes (0.6B, 4B, and 8B). This series inherits the excellent multilingual capabilities, long-text understanding, and reasoning skills of its base models. The Qwen3 Embedding series demonstrates significant advancements in various text embedding and ranking tasks, including text retrieval, code retrieval, text classification, text clustering, and bilingual text mining.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.068,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "qwen3-reranker-0.6b",
"description": "Based on the dense foundational model of the Qwen3 series, it is specifically designed for ranking tasks. It inherits the base models outstanding multilingual capabilities, long-text understanding, and reasoning skills, achieving significant advancements in ranking tasks.",
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 16000,
"max_output_tokens": 8000,
"pricing": {
"input": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.11,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"rerank"
],
"original_types": "rerank"
}
},
{
"id": "qwen3-reranker-4b",
"description": "Based on the dense foundational model of the Qwen3 series, it is specifically designed for ranking tasks. It inherits the base models outstanding multilingual capabilities, long-text understanding, and reasoning skills, achieving significant advancements in ranking tasks.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.11,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"rerank"
],
"original_types": "rerank"
}
},
{
"id": "qwen3-reranker-8b",
"description": "Based on the dense foundational model of the Qwen3 series, it is specifically designed for ranking tasks. It inherits the base models outstanding multilingual capabilities, long-text understanding, and reasoning skills, achieving significant advancements in ranking tasks.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.11,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"rerank"
],
"original_types": "rerank"
}
},
{
"id": "tao-8k",
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.068,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "bce-reranker-base",
"description": "Based on the dense foundational model of the Qwen3 series, it is specifically designed for ranking tasks. It inherits the base models outstanding multilingual capabilities, long-text understanding, and reasoning skills, achieving significant advancements in ranking tasks.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.068,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"rerank"
],
"original_types": "rerank"
}
},
{
"id": "codex-mini-latest",
"description": "Only supports v1/responses API calls.https://docs.aihubmix.com/en/api/Responses-API\ncodex-mini-latest is a fine-tuned version of o4-mini specifically for use in Codex CLI. For direct use in the API, we recommend starting with gpt-4.1.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 1.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.375,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "doubao-seedream-4-0",
"description": "Seedream 4.0 is a SOTA-level multimodal image creation model based on leading architecture. It breaks the creative boundaries of traditional text-to-image models by natively supporting text, single image, and multiple image inputs. Users can freely combine text and images to achieve various creative styles within the same model, such as multi-image fusion creation based on subject consistency, image editing, and set image generation, making image creation more flexible and controllable.\nSeedream 4.0 supports composite editing with up to 10 images in a single input. Through deep reasoning of prompt words, it automatically adapts the optimal image aspect ratio and generation quantity, enabling continuous output of up to 15 content-related images at one time. Additionally, the model significantly improves the accuracy and content diversity of Chinese generation, supports 4K ultra-high-definition output, and provides a one-stop solution from generation to editing for professional image creation.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "embedding-v1",
"description": "Embedding-V1 is a text representation model based on Baidu's Wenxin large model technology, capable of converting text into numerical vector forms for applications such as text retrieval, information recommendation, and knowledge mining. Embedding-V1 provides an Embeddings interface that generates corresponding vector representations based on the input content. By calling this interface, you can input text into the model and obtain the corresponding vector representations for subsequent text processing and analysis.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.068,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "ernie-4.5-turbo-latest",
"description": "Wenxin 4.5 Turbo also has significant improvements in hallucination reduction, logical reasoning, and coding capabilities. Compared to Wenxin 4.5, it is faster and more affordable.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 135000,
"max_output_tokens": 12000,
"pricing": {
"input": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.44,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "ernie-irag-edit",
"description": "Baidu's self-developed ERNIE iRAG Edit image editing model supports operations based on images such as erase (object removal), repaint (object redrawing), and variation (variant generation).",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT",
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation",
"tools",
"function_calling",
"structured_outputs"
],
"category": "image-generation",
"original_types": "image_generation",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "jina-clip-v2",
"description": "Multi-modal Embeddings Model, multilingual, 1024-dimensional, 865M parameters.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "jina-reranker-m0",
"description": "Multimodal multilingual document reranker, 10K context, 2.4B parameters, for visual document sorting.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"rerank"
],
"original_types": "rerank"
}
},
{
"id": "jina-colbert-v2",
"description": "Multi-language ColBERT embeddings model, 560M parameters, used for embedding and reranking.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding",
"rerank"
],
"original_types": "embedding,rerank"
}
},
{
"id": "gpt-4o-search-preview",
"description": "Using the Chat Completions API, you can directly access the fine-tuned models and tool used by Search in ChatGPT.\n\nWhen using Chat Completions, the model always retrieves information from the web before responding to your query. To use web_search_preview as a tool that models like gpt-4o and gpt-4o-mini invoke only when necessary, switch to using the Responses API.\n\nCurrently, you need to use one of these models to use web search in Chat Completions:\n\ngpt-4o-search-preview\ngpt-4o-mini-search-preview\nWeb search parameter example\nimport OpenAI from \"openai\";\nconst client = new OpenAI();\n\nconst completion = await client.chat.completions.create({\n model: \"gpt-4o-search-preview\",\n web_search_options: {},\n messages: [{\n \"role\": \"user\",\n \"content\": \"What was a positive news story from today?\"\n }],\n});\n\nconsole.log(completion.choices[0].message.content);\nOutput and citations\nThe API response item in the choices array will include:\n\nmessage.content with the text result from the model, inclusive of any inline citations\nannotations with a list of cited URLs\nBy default, the model's response will include inline citations for URLs found in the web search results. In addition to this, the url_citation annotation object will contain the URL and title of the cited source, as well as the start and end index characters in the model's response where those sources were used.",
"capabilities": [
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 128000,
"max_output_tokens": 16384,
"pricing": {
"input": {
"per_million_tokens": 2.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 1.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"web",
"function_calling",
"structured_outputs"
],
"original_types": "llm,search",
"original_features": "web,function_calling,structured_outputs"
}
},
{
"id": "DeepSeek-R1",
"description": "DeepSeek R1 is a new open-source model with performance on par with OpenAI's o1 and features fully open reasoning tokens. It is a 671B-parameter Mixture-of-Experts (MoE) model that activates 37B parameters during inference.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 1638000,
"max_output_tokens": 1638000,
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "gpt-4o-mini-search-preview",
"description": "Using the Chat Completions API, you can directly access the fine-tuned models and tool used by Search in ChatGPT.\n\nWhen using Chat Completions, the model always retrieves information from the web before responding to your query. To use web_search_preview as a tool that models like gpt-4o and gpt-4o-mini invoke only when necessary, switch to using the Responses API.\n\nCurrently, you need to use one of these models to use web search in Chat Completions:\n\ngpt-4o-search-preview\ngpt-4o-mini-search-preview\nWeb search parameter example\nimport OpenAI from \"openai\";\nconst client = new OpenAI();\n\nconst completion = await client.chat.completions.create({\n model: \"gpt-4o-search-preview\",\n web_search_options: {},\n messages: [{\n \"role\": \"user\",\n \"content\": \"What was a positive news story from today?\"\n }],\n});\n\nconsole.log(completion.choices[0].message.content);\nOutput and citations\nThe API response item in the choices array will include:\n\nmessage.content with the text result from the model, inclusive of any inline citations\nannotations with a list of cited URLs\nBy default, the model's response will include inline citations for URLs found in the web search results. In addition to this, the url_citation annotation object will contain the URL and title of the cited source, as well as the start and end index characters in the model's response where those sources were used.",
"capabilities": [
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 128000,
"max_output_tokens": 16384,
"pricing": {
"input": {
"per_million_tokens": 0.15,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"web",
"function_calling",
"structured_outputs"
],
"original_types": "llm,search",
"original_features": "web,function_calling,structured_outputs"
}
},
{
"id": "jina-embeddings-v3",
"description": "Text Embeddings Model, multilingual, 1024-dimensional, 570M parameters.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "FLUX.1-Kontext-pro",
"description": "Generate and edit images through both text and image prompts. Flux.1 Kontext is a multimodal flow matching model that enables both text-to-image generation and in-context image editing. Modify images while maintaining character consistency and performing local edits up to 8x faster than other leading models.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 40,
"currency": "USD"
},
"output": {
"per_million_tokens": 40,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "claude-3-7-sonnet",
"description": "Support for the thinking parameter through the original Claude SDK.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 3.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 16.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "ernie-4.5",
"description": "Wenxin Large Model 4.5 is a next-generation native multimodal foundational model independently developed by Baidu. It achieves collaborative optimization through joint modeling of multiple modalities, demonstrating excellent multimodal understanding capabilities; it possesses more advanced language abilities, with comprehensive improvements in comprehension, generation, logic, and memory, as well as significant enhancements in hallucination reduction, logical reasoning, and coding capabilities.ERNIE-4.5-21B-A3B is an aligned open-source model with a MoE structure, having a total of 21 billion parameters and 3 billion activated parameters.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 160000,
"max_output_tokens": 64000,
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.272,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "ernie-4.5-turbo-vl",
"description": "The new version of the Wenxin Yiyan large model significantly improves capabilities in image understanding, creation, translation, and coding. It supports a context length of up to 32K tokens for the first time, with a notable reduction in the latency of the first token.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 139000,
"max_output_tokens": 16000,
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "gemini-2.0-flash",
"description": "Gemini 2.0 Flash is Google's latest lightweight model featuring extremely low hallucination rates while maintaining fast response times, offering developers high-precision and efficient AI solutions particularly suited for applications requiring high factual accuracy.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 8192,
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "gemini-2.0-flash-preview-image-generation",
"description": "Gemini 2.0 Flash EXP is the official preview version of the drawing model. Compared to Imagen 3.0, Geminis image generation is better suited for scenarios that require contextual understanding and reasoning, rather than the pursuit of ultimate artistic performance and visual quality.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"image_generation"
],
"category": "image-generation",
"original_types": "llm,image_generation"
}
},
{
"id": "FLUX-1.1-pro",
"description": "FLUX-1.1-pro is an AI image generation tool for professional creators and content workflows. It understands complex semantic and structural instructions to deliver high consistency, multi-image coherence, and style customization from text prompts.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 40,
"currency": "USD"
},
"output": {
"per_million_tokens": 40,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "o3-mini",
"description": "OpenAI's latest fast inference model excels at STEAM tasks and offers exceptional cost-effectiveness. Official support for cache hits reduces input prices by half.",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 100000,
"pricing": {
"input": {
"per_million_tokens": 1.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 4.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.55,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking"
],
"original_types": "llm",
"original_features": "thinking"
}
},
{
"id": "doubao-seed-1-6",
"description": "Doubao-Seed-1.6 is a brand new multimodal deep reasoning model that supports four types of reasoning effort: minimal, low, medium, and high. It offers stronger model performance, serving complex tasks and challenging scenarios. It supports a 256k context window, with output length up to a maximum of 32k tokens.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 256000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.18,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.8,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.036,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinkingtools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinkingtools,function_calling,structured_outputs"
}
},
{
"id": "doubao-seed-1-6-flash",
"description": "Doubao-Seed-1.6-flash is an extremely fast multimodal deep thinking model, with TPOT requiring only 10ms. It supports both text and visual understanding, with its text comprehension skills surpassing the previous generation lite model and its visual understanding on par with competitor's pro series models. It supports a 256k context window and an output length of up to 16k tokens.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 256000,
"max_output_tokens": 33000,
"pricing": {
"input": {
"per_million_tokens": 0.044,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.44,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0088,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinkingtools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinkingtools,function_calling,structured_outputs"
}
},
{
"id": "doubao-seed-1-6-lite",
"description": "Doubao-Seed-1.6-lite is a brand new multimodal deep reasoning model that supports adjustable reasoning effort, with four modes: Minimal, Low, Medium, and High. It offers better cost performance, making it the best choice for common tasks, with a context window of up to 256k.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 256000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.082,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.656,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0164,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinkingtools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinkingtools,function_calling,structured_outputs"
}
},
{
"id": "doubao-seed-1-6-thinking",
"description": "The Doubao-Seed-1.6-thinking model has significantly enhanced reasoning capabilities. Compared with Doubao-1.5-thinking-pro, it has further improvements in fundamental abilities such as coding, mathematics, and logical reasoning, and now also supports visual understanding. It supports a 256k context window, with output length supporting up to 16k tokens.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"context_window": 256000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.18,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.8,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.036,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinkingtools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinkingtools,function_calling,structured_outputs"
}
},
{
"id": "qwen3-30b-a3b-instruct-2507",
"description": "Significantly improved performance on reasoning tasks, including logical reasoning, mathematics, science, coding, and academic benchmarks that typically require human expertise.\nMarkedly better general capabilities, such as instruction following, tool usage, text generation, and alignment with human preferences.\nEnhanced 256K long-context understanding capabilities.",
"pricing": {
"input": {
"per_million_tokens": 0.1028,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4112,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen3-30b-a3b-thinking-2507",
"description": "Significantly improved performance on reasoning tasks, including logical reasoning, mathematics, science, coding, and academic benchmarks that typically require human expertise.\nMarkedly better general capabilities, such as instruction following, tool usage, text generation, and alignment with human preferences.\nEnhanced 256K long-context understanding capabilities.",
"pricing": {
"input": {
"per_million_tokens": 0.12,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-embedding-001",
"description": "Latest version",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.15,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.15,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "gpt-oss-120b",
"description": "gpt-oss-120b is a 117B-parameter open-weight Mixture-of-Experts (MoE) language model from OpenAI, designed for high-reasoning, agentic, and general-purpose production use cases. Activating just 5.1B parameters per pass, it is optimized to run on a single H100 GPU with native MXFP4 quantization. The model features configurable reasoning depth, full chain-of-thought access, and native tool use, including function calling, browsing, and structured output generation.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 131072,
"max_output_tokens": 32768,
"pricing": {
"input": {
"per_million_tokens": 0.18,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.9,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,function_calling,structured_outputs"
}
},
{
"id": "qwen-3-235b-a22b-thinking-2507",
"description": "cerebras",
"pricing": {
"input": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "DeepSeek-R1-Distill-Qwen-32B",
"description": "The model provider is the Sophnet platform. Deepseek-R1-Distill-Qwen-32B is a knowledge-distilled large language model based on Qwen 2.5 32B and trained using outputs from DeepSeek R1.\nDeepSeek-R1 addresses issues such as infinite repetition, poor readability, and language mixing by introducing cold-start data before reinforcement learning.\nDeepSeek-R1s performance in mathematics, programming, and reasoning tasks is comparable to OpenAI-o1.\nTo support the research community, we have open-sourced DeepSeek-R1-Zero, DeepSeek-R1, and six dense models based on Llama and Qwen.\nDeepSeek-R1-Distill-Qwen-32B outperforms OpenAI-o1-mini on multiple benchmark tests, setting new state-of-the-art results for dense models.",
"pricing": {
"input": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.84,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "DeepSeek-R1-Distill-Qwen-7B",
"description": "The model provider is the Sophnet platform. DeepSeek-R1-Distill-Qwen-7B is a distilled model based on the Qwen architecture, optimized for high reasoning speed and low cost. It achieves approximately 70% of the performance of the original model at the 7B scale, while reducing response latency by 40%, making it suitable for real-time interactive scenarios.\nThe API call cost is only one-quarter of the original Qwen-7B.\nIt supports streaming output, making it suitable for applications like chatbots.\nIt achieves an accuracy of over 65% on the GSM8K math task.",
"pricing": {
"input": {
"per_million_tokens": 0.06,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.12,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "QwQ-32B",
"description": "The model provider is the Sophnet platform. QwQ is an inference model from the Qianwen series, featuring outstanding thinking and reasoning capabilities.\nCompared to traditional instruction-finetuned models, QwQ can achieve significantly enhanced performance on downstream tasks, especially on difficult problems.\nQwQ-32B is a medium-sized inference model capable of delivering competitive performance compared to state-of-the-art inference models such as DeepSeek-R1 and o1-mini.\nIt supports long context lengths of up to 128K tokens and can generate text up to 128K tokens.",
"pricing": {
"input": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.84,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen2-VL-72B-Instruct",
"description": "The model provider is the Sophnet platform. Qwen2-VL-72B-Instruct is the latest iteration in the Qwen2-VL series launched by Alibaba Cloud, representing nearly a year of innovative achievements. This model has 72 billion parameters and can understand images of various resolutions and aspect ratios. Additionally, it supports video understanding of over 20 minutes, enabling high-quality video question answering, dialogue, and content creation, along with complex reasoning and decision-making capabilities.\n\n- State-of-the-art image understanding: capable of processing images of various resolutions and aspect ratios, performing excellently across multiple visual understanding benchmarks.\n- Long video understanding: supports video comprehension exceeding 20 minutes, enabling high-quality video Q&A, dialogues, and content creation.\n- Agent operation capability: equipped with complex reasoning and decision-making abilities, it can integrate with devices such as phones and robots to perform automated operations based on visual environments and textual instructions.\n- Multilingual support: in addition to English and Chinese, it supports understanding text in images in multiple languages, including most European languages, Japanese, Korean, Arabic, Vietnamese, and more.\n- Supports a maximum context length of 128K tokens, offering powerful processing capabilities.",
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2.18,
"currency": "USD"
},
"output": {
"per_million_tokens": 6.54,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen2-VL-7B-Instruct",
"description": "The model provider is the Sophnet platform. Qwen2-VL-7B-Instruct is the latest vision-language model launched by Alibaba Cloud and the newest member of the Qwen family. This model is proficient not only in recognizing common objects but also in analyzing text, charts, icons, and layouts within images. As a visual agent, it can reason and dynamically guide tool usage, supporting operations on computers and mobile phones. Additionally, it can understand long videos exceeding one hour and capture key events, accurately locate objects in images, and generate structured outputs for data such as invoices and tables, making it suitable for various scenarios including finance and business.\n\n- Vision understanding capability: not only recognizes common objects but also analyzes text, charts, icons, and layouts within images.\n- Agent capability: functions as a visual agent capable of reasoning and dynamically guiding tool usage, supporting operations on computers and mobile phones.\n- Long video understanding: can comprehend video content over one hour in length and accurately localize relevant video segments.\n- Visual localization: precisely locates objects within images by generating bounding boxes or points, providing stable JSON coordinate outputs.\n- Structured output: supports structured data output for invoices, tables, and other data, suitable for finance, business, and various other scenarios.",
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.7,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "cc-kimi-for-coding",
"description": "for claude code",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen3-30B-A3B",
"description": "Provided by chutes.ai",
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen3-32B",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen3-14B",
"description": "Provided by chutes.ai",
"pricing": {
"input": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen3-8B",
"description": "Provided by chutes.ai",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-pro-exp-02-05-search",
"description": "Integrated with Google's official search and internet connectivity features.",
"capabilities": [
"WEB_SEARCH"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"web"
],
"original_features": "web"
}
},
{
"id": "gemini-2.5-pro-preview-06-05",
"description": "Googles latest multimodal flagship model, combining exceptional coding and reasoning capabilities. Its massive 1 million token context window (soon to expand to 2 million) places it at the top of the WebDevArena and LMArena leaderboards. It is particularly well-suited for developing aesthetically pleasing and highly functional interactive web applications, code transformation, and complex workflows. The newly introduced \"reasoning budget\" feature cleverly balances cost and performance, while optimized tool calls and response styles further enhance development efficiency, making it the ideal choice for rapid prototyping and advanced coding.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"context_window": 1048576,
"max_output_tokens": 65536,
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.31,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "Aihubmix-MAI-DS-R1",
"description": "MAI-DS-R1 is a refined version of DeepSeek-R1 by Microsoft AI, designed to improve responsiveness to previously blocked topics while enhancing safety. It integrates 110k Tulu-3 SFT samples and 350k multilingual safety-alignment examples. The model retains strong reasoning and coding abilities, surpasses R1-1776 in handling sensitive queries, and reduces harmful content leakage. Based on a transformer MoE architecture, it suits general-purpose tasks—excluding legal, medical, or autonomous systems.",
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "embedding-2",
"description": "A text vector model that converts input text information into vector representations so that, in conjunction with a vector database, it provides an external knowledge base for the large model, thereby improving the accuracy of the models reasoning.",
"input_modalities": [
"TEXT"
],
"context_window": 8000,
"pricing": {
"input": {
"per_million_tokens": 0.0686,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.0686,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "embedding-3",
"description": "A text vector model that converts input text into vector representations to work with a vector database and provide an external knowledge base for a large model. The model supports custom vector dimensions; it is recommended to choose 256, 512, 1024, or 2048 dimensions.",
"input_modalities": [
"TEXT"
],
"context_window": 8000,
"pricing": {
"input": {
"per_million_tokens": 0.0686,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.0686,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "gemini-2.0-flash-search",
"description": "Integrated with Google's official search and internet connectivity features.",
"capabilities": [
"WEB_SEARCH"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"web"
],
"original_types": "llm,search",
"original_features": "web"
}
},
{
"id": "Qwen/Qwen2.5-VL-72B-Instruct",
"description": "Qwen2.5-VL is a visual language model from the Qwen2.5 series, equipped with strong visual understanding and reasoning capabilities. It can recognize objects, analyze text and charts, understand key events in long videos, and accurately locate targets within images. The model supports structured output, making it suitable for data such as invoices and forms, and performs excellently in multiple benchmark tests.",
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "o1",
"description": "OpenAI's most powerful O-series model supports official cache hits that halve the input cost.",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 15,
"currency": "USD"
},
"output": {
"per_million_tokens": 60,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 7.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking"
],
"original_types": "llm",
"original_features": "thinking"
}
},
{
"id": "o1-pro",
"description": "The o1 series of models are trained with reinforcement learning to think before they answer and perform complex reasoning. The o1-pro model uses more compute to think harder and provide consistently better answers.",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 170,
"currency": "USD"
},
"output": {
"per_million_tokens": 680,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 170,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking"
],
"original_types": "llm",
"original_features": "thinking"
}
},
{
"id": "ByteDance-Seed/Seed-OSS-36B-Instruct",
"description": "Seed-OSS is a series of open-source large language models developed by ByteDance's Seed team, designed specifically for powerful long-context processing, reasoning, agents, and general capabilities. Among this series, Seed-OSS-36B-Instruct is an instruction-tuned model with 36 billion parameters that natively supports ultra-long context lengths, enabling it to process massive documents or complex codebases in a single pass. This model is specially optimized for reasoning, code generation, and agent tasks (such as tool usage), while maintaining balanced and excellent general capabilities. A notable feature of this model is the \"Thinking Budget\" functionality, which allows users to flexibly adjust the inference length as needed, thereby effectively improving inference efficiency in practical applications.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 256000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.534,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinkingtools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinkingtools,function_calling,structured_outputs"
}
},
{
"id": "doubao-seed-1-6-250615",
"description": "Doubao-Seed-1.6 is a brand new multimodal deep reasoning model that supports four types of reasoning effort: minimal, low, medium, and high. It offers stronger model performance, serving complex tasks and challenging scenarios. It supports a 256k context window, with output length up to a maximum of 32k tokens.",
"pricing": {
"input": {
"per_million_tokens": 0.18,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.52,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.036,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "doubao-seed-1-6-flash-250615",
"description": "Doubao-Seed-1.6-flash is an extremely fast multimodal deep thinking model, with TPOT requiring only 10ms. It supports both text and visual understanding, with its text comprehension skills surpassing the previous generation lite model and its visual understanding on par with competitor's pro series models. It supports a 256k context window and an output length of up to 16k tokens.",
"pricing": {
"input": {
"per_million_tokens": 0.044,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.44,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0088,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "doubao-seed-1-6-thinking-250615",
"description": "The Doubao-Seed-1.6-thinking model has significantly enhanced reasoning capabilities. Compared with Doubao-1.5-thinking-pro, it has further improvements in fundamental abilities such as coding, mathematics, and logical reasoning, and now also supports visual understanding. It supports a 256k context window, with output length supporting up to 16k tokens.",
"pricing": {
"input": {
"per_million_tokens": 0.18,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.52,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.036,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "doubao-seed-1-6-vision-250815",
"description": "Doubao-Seed-1.6-vision is a visual deep-thinking model that demonstrates stronger general multimodal understanding and reasoning capabilities in scenarios such as education, image moderation, inspection and security, and AI search Q&A. It supports a 256K context window and an output length of up to 64K tokens.",
"pricing": {
"input": {
"per_million_tokens": 0.10959,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.0959,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.021918,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-1.5-thinking-pro",
"description": "Doubao-1.5 is a brand-new deep thinking model that excels in specialized fields such as mathematics, programming, scientific reasoning, and general tasks like creative writing. It achieves or approaches the top-tier industry level on multiple authoritative benchmarks including AIME 2024, Codeforces, and GPQA. It supports a 128k context window and 16k output.",
"pricing": {
"input": {
"per_million_tokens": 0.62,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.48,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.62,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "cc-minimax-m2",
"description": "For Claude Code only",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "deepseek-ai/DeepSeek-Prover-V2-671B",
"description": "Provided by chutes.ai\nDeepSeek Prover V2 is a 671B parameter model, speculated to be geared towards logic and mathematics. Likely an upgrade from DeepSeek-Prover-V1.5 Not much is known about the model yet, as DeepSeek released it on Hugging Face without an announcement or description.",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemma-3-12b-it",
"description": "Gemma 3 models are multimodal, handling text and image input and generating text output, with open weights for both pre-trained variants and instruction-tuned variants. Gemma 3 has a large, 128K context window, multilingual support in over 140 languages, and is available in more sizes than previous versions. Gemma 3 models are well-suited for a variety of text generation and image understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemma-3-1b-it",
"description": "Gemma 3 models are multimodal, handling text and image input and generating text output, with open weights for both pre-trained variants and instruction-tuned variants. Gemma 3 has a large, 128K context window, multilingual support in over 140 languages, and is available in more sizes than previous versions. Gemma 3 models are well-suited for a variety of text generation and image understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemma-3-27b-it",
"description": "Gemma 3 models are multimodal, handling text and image input and generating text output, with open weights for both pre-trained variants and instruction-tuned variants. Gemma 3 has a large, 128K context window, multilingual support in over 140 languages, and is available in more sizes than previous versions. Gemma 3 models are well-suited for a variety of text generation and image understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemma-3-4b-it",
"description": "Gemma 3 models are multimodal, handling text and image input and generating text output, with open weights for both pre-trained variants and instruction-tuned variants. Gemma 3 has a large, 128K context window, multilingual support in over 140 languages, and is available in more sizes than previous versions. Gemma 3 models are well-suited for a variety of text generation and image understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemma-3n-e4b-it",
"description": "Gemma 3n is a generative AI model optimized for use in everyday devices, such as phones, laptops, and tablets. This model includes innovations in parameter-efficient processing, including Per-Layer Embedding (PLE) parameter caching and a MatFormer model architecture that provides the flexibility to reduce compute and memory requirements. These models feature audio input handling, as well as text and visual data.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4o-image-vip",
"description": "First Taste of GPT-4o's Image Generation API: Perfectly mirrors the web version's raw image creation capabilities, supporting both text-to-image and image+text-to-image generation. Each creation costs as little as $0.009.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 7,
"currency": "USD"
},
"output": {
"per_million_tokens": 7,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "gpt-4o-image",
"description": "First Taste of GPT-4o's Image Generation API: Perfectly mirrors the web version's raw image creation capabilities, supporting both text-to-image and image+text-to-image generation. Each creation costs as little as $0.005.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 3,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "deepseek-r1-distill-llama-70b",
"description": "Provided by Groq, the DeepSeek-R1-Distill model is fine-tuned based on an open-source model, using samples generated by DeepSeek-R1. We have made slight modifications to their configurations and tokenizers. Please use our settings to run these models.",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking"
],
"original_types": "llm",
"original_features": "thinking"
}
},
{
"id": "gpt-4o-mini-tts",
"description": "OpenAIs latest TTS model, gpt-4o-mini-tts, uses the same API endpoint (/v1/audio/speech) as tts-1. However, OpenAI introduced a new pricing method without providing billing details via API, causing discrepancies between official pricing and aihubmixs charges—some requests may cost more, others less. Avoid using this model if precise billing accuracy is essential.",
"input_modalities": [
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 15,
"currency": "USD"
},
"output": {
"per_million_tokens": 15,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"tts"
],
"original_types": "tts"
}
},
{
"id": "tngtech/DeepSeek-R1T-Chimera",
"description": "Provided by chutes.ai\nDeepSeek-R1T-Chimera merges DeepSeek-R1s reasoning strengths with DeepSeek-V3 (0324)s token-efficiency improvements into a MoE Transformer optimized for general text generation. It integrates pretrained weights from both models and is released under the MIT license for research and commercial use.\n",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-flash-exp",
"description": "https://doc.aihubmix.com/en/api/Gemini%20%E5%9B%BE%E7%89%87%E7%94%9F%E6%88%90%E5%92%8C%E7%BC%96%E8%BE%91\nInstructions:\n\nNeed to add parameters to experience new features: \"modalities\":[\"text\",\"image\"]\nImages are passed and output in Base64 encoding\nAs an experimental model, it's recommended to explicitly specify \"output image\", otherwise it might only output text\nDefault height for output images is 1024px\nPython calls require the latest OpenAI SDK, run pip install -U openai first",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.08,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"image_generation",
"long_context"
],
"category": "image-generation",
"original_types": "llm,image_generation",
"original_features": "long_context"
}
},
{
"id": "claude-3-5-sonnet",
"description": "Claude 3.5 Sonnet delivers performance superior to Opus and speeds faster than its predecessor, all at the same price point. Its core strengths include:\n\nCoding: Autonomously writes, edits, and executes code with advanced reasoning and troubleshooting.\nData Science: Augments human expertise by analyzing unstructured data and using multiple tools to generate insights.\nVisual Processing: Excels at interpreting charts, graphs, and images, accurately transcribing text to derive high-level insights.\nAgentic Tasks: Exceptional tool use makes it highly effective for complex, multi-step agentic workflows that interact with other systems.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 8192,
"pricing": {
"input": {
"per_million_tokens": 3.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 16.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "o1-preview",
"description": "The latest and most powerful inference model from OpenAI; AiHubMix uses both OpenAI and Microsoft Azure OpenAI channels simultaneously to achieve high-concurrency load balancing.",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 15,
"currency": "USD"
},
"output": {
"per_million_tokens": 60,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 7.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking"
],
"original_types": "llm",
"original_features": "thinking"
}
},
{
"id": "o1-mini",
"description": "o1-mini is faster and 80% cheaper, and is competitive with o1-preview on coding tasks. AiHubMix uses both OpenAI and Microsoft Azure OpenAI channels simultaneously.",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 12,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 1.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-flash-thinking-exp-01-21",
"description": "The latest version, Gemini 2.0 Flash Thinking mode, is an experimental model designed to generate the \"thought process\" that the model goes through during its responses. Therefore, Gemini 2.0 Flash Thinking mode has stronger reasoning capabilities in its responses compared to the base Gemini 2.0 Flash model.",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 0.076,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.304,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"long_context"
],
"original_types": "llm",
"original_features": "thinking,long_context"
}
},
{
"id": "gpt-4o-2024-11-20",
"description": "The latest version of the GPT-4o model; it is recommended to use this version, as it is currently smarter than the regular 4o.",
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 128000,
"max_output_tokens": 16384,
"pricing": {
"input": {
"per_million_tokens": 2.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 1.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "gpt-4o",
"description": "GPT-4o (“o” stands for “omni”) is a new-generation multimodal model designed for more natural humancomputer interaction. It can accept any combination of text, audio, image, and video as input, and generate multimodal outputs including text, audio, and images. With audio response latency as low as 232 milliseconds on average around 320 milliseconds, it approaches real human conversational speed. The model delivers strong performance in English text and code, significantly improved multilingual understanding, and outstanding capabilities in visual and audio perception, while offering faster API performance and substantially reduced cost for real-time and complex multimodal applications.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 128000,
"max_output_tokens": 16384,
"pricing": {
"input": {
"per_million_tokens": 2.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 1.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "chatgpt-4o-latest",
"description": "This model will point to the latest GPT-4o model used by ChatGPT.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 5,
"currency": "USD"
},
"output": {
"per_million_tokens": 15,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "gpt-4o-mini",
"description": "The lightweight version of GPT-4o, which is affordable and fast, suitable for handling simple tasks; our site supports the official automatic caching for this model, and charges for cache hits will be automatically halved.",
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 128000,
"max_output_tokens": 16384,
"pricing": {
"input": {
"per_million_tokens": 0.15,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "AiHubmix-mistral-medium",
"description": "Mistral Medium 3 is a SOTA & versatile model designed for a wide range of tasks, including programming, mathematical reasoning, understanding long documents, summarization, and dialogue.\n\nIt boasts multi-modal capabilities, enabling it to process visual inputs, and supports dozens of languages, including over 80 coding languages. Additionally, it features function calling and agentic workflows.\n\nMistral Medium 3 is optimized for single-node inference, particularly for long-context applications. Its size allows it to achieve high throughput on a single node.",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-pro-exp-02-05",
"description": "The latest experimental version of Gemini-2.0-Pro",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"image_generation"
],
"category": "image-generation",
"original_types": "llm,image_generation"
}
},
{
"id": "minimax-m2",
"description": "MiniMax-M2 redefines efficiency for intelligent agents. It is a compact, fast, and cost-effective MoE model with a total of 230 billion parameters and 10 billion active parameters, designed for top performance in coding and intelligent agent tasks while maintaining strong general intelligence. With only 10 billion active parameters, MiniMax-M2 delivers the complex end-to-end tool usage performance expected from today's leading models, but in a more streamlined form factor, making deployment and scaling easier than ever before.",
"input_modalities": [
"TEXT"
],
"context_window": 204800,
"max_output_tokens": 192000,
"pricing": {
"input": {
"per_million_tokens": 0.288,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.152,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "ERNIE-X1.1-Preview",
"description": "The Wenxin large model X1.1 has made significant improvements in question answering, tool invocation, intelligent agents, instruction following, logical reasoning, mathematics, and coding tasks, with notable enhancements in factual accuracy. The context length has been extended to 64K tokens, supporting longer inputs and dialogue history, which improves the coherence of long-chain reasoning while maintaining response speed.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 119000,
"max_output_tokens": 64000,
"pricing": {
"input": {
"per_million_tokens": 0.136,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.544,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "Qwen/QwQ-32B",
"description": "Silicon-based flow provision",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"pricing": {
"input": {
"per_million_tokens": 0.14,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.56,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "chutesai/Mistral-Small-3.1-24B-Instruct-2503",
"description": "Mistral's latest open-source small model; provided by chutes.ai.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "ernie-x1.1-preview",
"description": "The Wenxin large model X1.1 has made significant improvements in question answering, tool invocation, intelligent agents, instruction following, logical reasoning, mathematics, and coding tasks, with notable enhancements in factual accuracy. The context length has been extended to 64K tokens, supporting longer inputs and dialogue history, which improves the coherence of long-chain reasoning while maintaining response speed.",
"pricing": {
"input": {
"per_million_tokens": 0.136,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.544,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "MiniMaxAI/MiniMax-M1-80k",
"description": "MiniMax-M1 is an open-source large-scale hybrid attention model with 456B total parameters (45.9B activated per token). It natively supports 1M-token context and reduces FLOPs by 75% versus DeepSeek R1 in 100K-token generation tasks via lightning attention. Built on MoE architecture and optimized by CISPO algorithm, it achieves state-of-the-art performance in long-context reasoning and real-world software engineering scenarios.",
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2.5-VL-32B-Instruct",
"description": "Qwen2.5-VL-32B-Instruct is an advanced multimodal model from the Tongyi Qianwen team that can recognize objects, analyze text and graphics in images, operate tools, locate objects in images, and generate structured outputs. Through reinforcement learning, it has improved mathematics and problem-solving capabilities, with a more concise and natural response style.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 0.24,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.24,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "baidu/ERNIE-4.5-300B-A47B",
"description": "ERNIE-4.5-300B-A47B is a large language model developed by Baidu based on a Mixture of Experts (MoE) architecture. The model has a total of 300 billion parameters, but only activates 47 billion parameters per token during inference, which balances strong performance with computational efficiency. As one of the core models in the ERNIE 4.5 series, it demonstrates outstanding capabilities in tasks such as text understanding, generation, reasoning, and programming. The model employs an innovative multimodal heterogeneous MoE pretraining approach, leveraging joint training of textual and visual modalities to effectively enhance the models overall abilities, particularly excelling in instruction following and world knowledge memorization. Baidu has open-sourced this model along with other models in the series, aiming to promote the research and application of AI technology.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.32,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.28,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "bge-large-en",
"description": "bge-large-en, open-sourced by the Beijing Academy of Artificial Intelligence (BAAI), is currently the most powerful vector representation model for Chinese tasks, with its semantic representation capabilities comprehensively surpassing those of similar open-source models.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.068,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "embedding",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "bge-large-zh",
"description": "bge-large-zh, open-sourced by the Beijing Academy of Artificial Intelligence (BAAI), is currently the most powerful vector representation model for Chinese tasks, with its semantic representation capabilities comprehensively surpassing those of similar open-source models.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.068,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.068,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "embedding",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "claude-opus-4-0",
"description": "Alias \nclaude-opus-4-20250514",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 32000,
"pricing": {
"input": {
"per_million_tokens": 16.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 82.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "codestral-latest",
"description": "Mistral has launched a new code model - Codestral 25.01; https://mistral.ai/news/codestral-2501/",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "ernie-4.5-0.3b",
"description": "Wenxin Large Model 4.5 is a next-generation native multimodal foundational large model independently developed by Baidu. It achieves collaborative optimization through joint modeling of multiple modalities, demonstrating excellent multimodal understanding capabilities. The model possesses enhanced language abilities, with comprehensive improvements in understanding, generation, reasoning, and memory. It significantly reduces hallucinations and shows notable advancements in logical reasoning and coding skills.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.0136,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.0544,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "ernie-4.5-turbo-128k-preview",
"description": "Wenxin 4.5 Turbo also shows significant enhancements in reducing hallucinations, logical reasoning, and coding capabilities. Compared to Wenxin 4.5, it is faster and more cost-effective.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.108,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.432,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "ernie-x1-turbo",
"description": "Wenxin Large Model X1 possesses enhanced abilities in understanding, planning, reflection, and evolution. As a more comprehensive deep-thinking model, Wenxin X1 combines accuracy, creativity, and literary elegance, excelling particularly in Chinese knowledge Q&A, literary creation, document writing, daily conversations, logical reasoning, complex calculations, and tool invocation.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 50500,
"max_output_tokens": 28000,
"pricing": {
"input": {
"per_million_tokens": 0.136,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.544,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,tools,function_calling,structured_outputs"
}
},
{
"id": "gemini-2.0-flash-exp-search",
"description": "The gemini-2.0-flash-exp model supports internet connectivity, but the official version requires additional request parameters to enable it. Aihubmix has integrated this by automatically calling the official API's online functionality when the model name is requested with the \"search\" parameter.",
"capabilities": [
"WEB_SEARCH",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"search",
"web",
"tools",
"function_calling",
"structured_outputs",
"long_context"
],
"original_types": "llm,search",
"original_features": "web,tools,function_calling,structured_outputs,long_context"
}
},
{
"id": "kat-dev",
"description": "KAT-Dev (32B) is an open-source 32B parameter model specifically designed for software engineering tasks. It achieved a 62.4% resolution rate on the SWE-Bench Verified benchmark, ranking fifth among all open-source models of various scales. The model is optimized through multiple stages, including intermediate training, supervised fine-tuning (SFT) and reinforcement fine-tuning (RFT), as well as large-scale agent reinforcement learning (RL). Based on Qwen3-32B, its training process lays the foundation for subsequent fine-tuning and reinforcement learning stages by enhancing fundamental abilities such as tool usage, multi-turn interaction, and instruction following. During the fine-tuning phase, the model not only learns eight carefully curated task types and programming scenarios but also innovatively introduces a reinforcement fine-tuning (RFT) stage guided by human engineer-annotated “teacher trajectories.” The final agent reinforcement learning phase addresses scalability challenges through multi-level prefix caching, entropy-based trajectory pruning, and efficient architecture.",
"capabilities": [
"FUNCTION_CALL"
],
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"pricing": {
"input": {
"per_million_tokens": 0.137,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.548,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools"
],
"original_types": "llm",
"original_features": "tools"
}
},
{
"id": "llama-3.3-70b",
"description": "The Meta Llama 3.3 multilingual large language model (LLM) is a pretrained and instruction tuned generative model in 70B (text in/text out). The Llama 3.3 instruction tuned text only model is optimized for multilingual dialogue use cases and outperforms many of the available open source and closed chat models on common industry benchmarks.",
"context_window": 65536,
"max_output_tokens": 8192,
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "moonshotai/Kimi-Dev-72B",
"description": "Kimi-Dev-72B is a new generation open-source programming large model that achieved a leading performance of 60.4% on SWE-bench Verified. Through large-scale reinforcement learning optimization, it can automatically fix code in real Docker environments, receiving rewards only when passing the complete test suite, thereby ensuring the correctness and robustness of solutions and aligning more closely with real software development standards.",
"pricing": {
"input": {
"per_million_tokens": 0.32,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.28,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "moonshotai/Moonlight-16B-A3B-Instruct",
"description": "Provided by chutes.ai.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "o1-global",
"description": "OpenAI new model",
"pricing": {
"input": {
"per_million_tokens": 15,
"currency": "USD"
},
"output": {
"per_million_tokens": 60,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 7.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qianfan-qi-vl",
"description": "The Qianfan-QI-VL model is a proprietary image quality inspection and visual understanding large model (Quality Inspection Large Vision Language Model, Qianfan-QI-VL) developed by Baidu Clouds Qianfan platform. It is designed for quality inspection of product images uploaded in e-commerce scenarios, with detection capabilities including AIGC human defect detection, mosaic recognition, watermark recognition, and trademark detection.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-vl-72b-instruct",
"description": "Strong capability in Chinese domain recognition, comparable to ChatGPT-4.0.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 7.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "tencent/Hunyuan-A13B-Instruct",
"description": "Hunyuan-A13B-Instruct has 8 billion parameters and can match larger models by activating only 1.3 billion parameters, supporting \"fast thinking/slow thinking\" hybrid inference. It offers stable long text understanding. Verified by BFCL-v3 and τ-Bench, its Agent capabilities are leading in the field. Combined with GQA and multiple quantization formats, it enables efficient inference.",
"pricing": {
"input": {
"per_million_tokens": 0.14,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.56,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "unsloth/gemma-3-27b-it",
"description": "Google's latest open-source model; provided by chutes.ai",
"pricing": {
"input": {
"per_million_tokens": 0.22,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.22,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "unsloth/gemma-3-12b-it",
"description": "Provided by chutes.ai.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-exp-1206",
"description": "Google's latest experimental model, currently Google's most powerful model.",
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4o-zh",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "qwen-qwq-32b",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-max-0125",
"description": "Qwen 2.5-Max latest model",
"pricing": {
"input": {
"per_million_tokens": 0.38,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.52,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "claude-3-5-haiku",
"description": "Claude 3.5 Haiku is the next generation of Claude's fastest model.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 8192,
"pricing": {
"input": {
"per_million_tokens": 1.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 5.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "BAAI/bge-large-en-v1.5",
"description": "BAAI/bge-large-en-v1.5 is a large English text embedding model and part of the BGE (BAAI General Embedding) series. It achieves excellent performance on the MTEB benchmark, with an average score of 64.23 across 56 datasets, excelling in tasks such as retrieval, clustering, and text pair classification. The model supports a maximum input length of 512 tokens and is suitable for various natural language processing tasks, such as text retrieval and semantic similarity computation.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.034,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.034,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "embedding",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "BAAI/bge-large-zh-v1.5",
"description": "BAAI/bge-large-zh-v1.5 is a large Chinese text embedding model and part of the BGE (BAAI General Embedding) series. It performs excellently on the C-MTEB benchmark, achieving an average score of 64.53 across 31 datasets, with outstanding results in tasks such as retrieval, semantic similarity, and text pair classification. The model supports a maximum input length of 512 tokens and is suitable for various Chinese natural language processing tasks, such as text retrieval and semantic similarity computation.",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.034,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.034,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "embedding",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "BAAI/bge-reranker-v2-m3",
"description": "BAAI/bge-reranker-v2-m3 is a lightweight multilingual reranking model. It is developed based on the bge-m3 model, offering strong multilingual capabilities, easy deployment, and fast inference. The model takes a query and documents as input and directly outputs similarity scores instead of embedding vectors. It is suitable for multilingual scenarios and performs particularly well in both Chinese and English processing.",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.034,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.034,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"rerank"
],
"original_types": "rerank"
}
},
{
"id": "tencent/Hunyuan-MT-7B",
"description": "Hunyuan-MT-7B is a lightweight translation model with 7 billion parameters, designed to translate source text into target languages. The model supports translation among 33 languages as well as 5 Chinese minority languages. In the WMT25 International Machine Translation Competition, Hunyuan-MT-7B achieved first place in 30 out of 31 language categories it participated in, demonstrating its exceptional translation capabilities. For translation scenarios, Tencent Hunyuan proposed a complete training paradigm from pre-training to supervised fine-tuning, followed by translation reinforcement and ensemble reinforcement, enabling it to achieve industry-leading performance among models of similar scale. The model is computationally efficient, easy to deploy, and suitable for various application scenarios.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-flash-lite-preview-02-05",
"description": "Gemini 2.0 Flash lightweight version",
"pricing": {
"input": {
"per_million_tokens": 0.075,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "V3",
"description": "Fast and high-quality — top image quality in just 11 seconds per piece, with almost no extra time for batch generation.\nFlexible ratios — supports ultra-wide and tall formats like 3:1, 2:1, offering diverse perspectives.\nUnique strengths — outstanding design capabilities in the V3 and V2 series, with powerful text rendering (Chinese support coming soon).\nPrecise local editing — fine-tuned mask control for area redrawing (edit) and easy background replacement (replace-background).",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "sonar-reasoning",
"description": "Perplexity inference model",
"pricing": {
"input": {
"per_million_tokens": 1.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "V_2",
"description": "The Ideogram AI drawing interface is now live. This model boasts powerful text-to-image capabilities, supporting endpoints are: /generate, /remix, /edit.\nThis model is the stable V_2 version, highly recommended for editing.\nUS $0.08/ 1 IMG.\nFor usage examples and pricing details, refer to the documentation at https://docs.aihubmix.com/cn/api/IdeogramAI.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "V_2_TURBO",
"description": "The Ideogram AI drawing interface is now live. This model boasts powerful text-to-image capabilities, supporting endpoints are: /generate, /remix, /edit.\nThis model is the fast version of V_2, offering increased speed at the slight expense of quality.\nUS $0.05/ IMG.\nFor usage examples and pricing details, refer to the documentation at https://docs.aihubmix.com/cn/api/IdeogramAI.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "V_2A",
"description": "The Ideogram AI drawing interface is now live. This model boasts powerful text-to-image capabilities, supporting endpoints are: /generate, /remix.\nThis model is the fast version of V_2, faster and cheaper.\nUS $0.04/ IMG.\nFor usage examples and pricing details, refer to the documentation at https://docs.aihubmix.com/cn/api/IdeogramAI.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "V_2A_TURBO",
"description": "The Ideogram AI drawing interface is now live. This model boasts powerful text-to-image capabilities, supporting endpoints are: /generate, /remix.\nThis model is the ultra-fast version of V_2, delivering the highest speed while slightly reducing quality.\nUS $0.025/ IMG.\nFor usage examples and pricing details, refer to the documentation at https://docs.aihubmix.com/cn/api/IdeogramAI.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "V_1",
"description": "V_1 is a text-to-image model in the Ideogram series. It delivers strong text rendering capabilities, high photorealistic image quality, and precise prompt adherence. The model also introduces Magic Prompt, a new feature that automatically refines input prompts to generate more detailed and creative visuals.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "V_1_TURBO",
"description": "The Ideogram AI drawing interface is now live. This model boasts powerful text-to-image capabilities, supporting endpoints are: /generate, /remix.\nThis model is the ultra-fast version of the original V_1, offering increased speed at the slight expense of quality.\nUS $0.02/ IMG.\nFor usage examples and pricing details, refer to the documentation at https://docs.aihubmix.com/cn/api/IdeogramAI.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "doubao-embedding-large-text-240915",
"description": "doubao-embedding-large-text-240915\nDoubao Embedding is a semantic vectorization model developed by ByteDance, primarily designed for vector search scenarios. It supports both Chinese and English languages and has a maximum context length of approximately 4K tokens.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "google/gemma-3-27b-it",
"description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. Gemma 3 models are multimodal, handling text and image input and generating text output, with open weights for both pre-trained variants and instruction-tuned variants. Gemma 3 has a large, 128K context window, multilingual support in over 140 languages, and is available in more sizes than previous versions. Gemma 3 models are well-suited for a variety of text generation and image understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as laptops, desktops or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone. This model is ready for commercial use.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "kimi-thinking-preview",
"description": "The latest kimi model.",
"pricing": {
"input": {
"per_million_tokens": 30,
"currency": "USD"
},
"output": {
"per_million_tokens": 30,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4o-2024-08-06",
"description": "Supports caching, with automatic halving of charges upon a cache hit.",
"pricing": {
"input": {
"per_million_tokens": 2.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 1.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-plus-2025-07-28",
"description": "The Tongyi Qianwen series balanced capability model has inference performance and speed between Tongyi Qianwen-Max and Tongyi Qianwen-Turbo, making it suitable for moderately complex tasks. This model adopts tiered pricing.",
"pricing": {
"input": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.275,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.11,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-plus-latest",
"description": "The Qwen series models with balanced capabilities have inference performance and speed between Qwen-Max and Qwen-Turbo, making them suitable for moderately complex tasks. This model is a dynamically updated version, and updates will not be announced in advance. The current version is qwen-plus-2025-04-28.The model adopts tiered pricing.",
"pricing": {
"input": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.275,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.11,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "AiHubmix-Phi-4-reasoning",
"description": "Phi-4-Reasoning is a state-of-the-art open-weight reasoning model finetuned from Phi-4 using supervised fine-tuning on a dataset of chain-of-thought traces and reinforcement learning. The supervised fine-tuning dataset includes a blend of synthetic prompts and high-quality filtered data from public domain websites, focused on math, science, and coding skills as well as alignment data for safety and Responsible AI. The goal of this approach was to ensure that small capable models were trained with data focused on high quality and advanced reasoning.",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"max_output_tokens": 4000,
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking"
],
"original_types": "llm",
"original_features": "thinking"
}
},
{
"id": "sonar",
"description": "Latest Perplexity Model",
"pricing": {
"input": {
"per_million_tokens": 1.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "stepfun-ai/step3",
"description": "Step3 is a multimodal reasoning model released by StepFun. It uses a MixtureofExperts (MoE) architecture with 321billion total parameters and 38billion activation parameters. The model follows an endtoend design that reduces decoding cost while delivering toptier performance on visionlanguage reasoning tasks. Thanks to the combined use of MultiHead Factorized Attention (MFA) and AttentionFFN Decoupling (AFD), Step3 remains highly efficient on both flagship and lowend accelerators. During pretraining, it processed over 20trillion text tokens and 4trillion imagetext mixed tokens, covering more than ten languages. On benchmarks for mathematics, code, and multimodal tasks, Step3 consistently outperforms other opensource models.",
"pricing": {
"input": {
"per_million_tokens": 1.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.75,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-embedding-v4",
"description": "This is the Tongyi Laboratory's multilingual unified text vector model trained based on Qwen3, which significantly improves performance in text retrieval, clustering, and classification compared to version V3; it achieves a 15% to 40% improvement on evaluation tasks such as MTEB multilingual, Chinese-English, and code retrieval; supports user-defined vector dimensions ranging from 64 to 2048.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.08,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.08,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "qwen-turbo-latest",
"description": "The Qwen series model with the fastest speed and lowest cost, suitable for simple tasks. This model is a dynamically updated version, and updates will not be announced in advance. The model's overall Chinese and English abilities have been significantly improved, human preference alignment has been greatly enhanced, inference capability and complex instruction understanding have been substantially strengthened, performance on difficult tasks is better, and mathematics and coding skills have been significantly improved. The current version is qwen-turbo-2025-04-28.",
"pricing": {
"input": {
"per_million_tokens": 0.046,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.92,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "AiHubmix-Phi-4-mini-reasoning",
"description": "Phi-4-mini-reasoning is a lightweight open model designed for advanced mathematical reasoning and logic-intensive problem-solving. It is particularly well-suited for tasks such as formal proofs, symbolic computation, and solving multi-step word problems. With its efficient architecture, the model balances high-quality reasoning performance with cost-effective deployment, making it ideal for educational applications, embedded tutoring, and lightweight edge or mobile systems.\n\nPhi-4-mini-reasoning supports a 128K token context length, enabling it to process and reason over long mathematical problems and proofs. Built on synthetic and high-quality math datasets, the model leverages advanced fine-tuning techniques such as supervised fine-tuning and preference modeling to enhance reasoning capabilities. Its training incorporates safety and alignment protocols, ensuring robust and reliable performance across supported use cases.",
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"max_output_tokens": 4000,
"pricing": {
"input": {
"per_million_tokens": 0.12,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.12,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "aihub-Phi-4-multimodal-instruct",
"description": "Microsoft's latest model",
"input_modalities": [
"TEXT",
"VISION",
"AUDIO"
],
"context_window": 128000,
"max_output_tokens": 4000,
"pricing": {
"input": {
"per_million_tokens": 0.12,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.48,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "qwen3-30b-a3b",
"description": "Achieves effective integration of thinking and non-thinking modes, allowing mode switching during conversations. Its reasoning ability matches that of QwQ-32B with a smaller parameter size, and its general capability significantly surpasses Qwen2.5-14B, reaching state-of-the-art (SOTA) levels among industry models of the same scale.",
"pricing": {
"input": {
"per_million_tokens": 0.12,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen3-32b",
"description": "Achieves effective integration of thinking and non-thinking modes, allowing mode switching during conversations. Its reasoning ability significantly surpasses QwQ, and its general capability significantly exceeds Qwen2.5-32B-Instruct, reaching state-of-the-art (SOTA) levels among industry models of the same scale.",
"pricing": {
"input": {
"per_million_tokens": 0.32,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "grok-3",
"description": "Grok's latest model",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 15,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihub-Phi-4-mini-instruct",
"description": "Microsoft's latest model",
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"max_output_tokens": 4000,
"pricing": {
"input": {
"per_million_tokens": 0.12,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.48,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "aihub-Phi-4",
"description": "Phi-4 is a state-of-the-art open model based on a combination of synthetic datasets, curated public domain website data, and acquired academic books and QA datasets. The approach aims to ensure that small, efficient models are trained using data focused on high quality and advanced reasoning.",
"input_modalities": [
"TEXT"
],
"context_window": 16400,
"max_output_tokens": 16400,
"pricing": {
"input": {
"per_million_tokens": 0.12,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.48,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "claude-3-opus-20240229",
"description": "Claudes previous generation strongest model",
"pricing": {
"input": {
"per_million_tokens": 16.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 82.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "dall-e-3",
"description": "dall-e-3 is an AI image generation model that converts natural language prompts into realistic visuals and artistic content. It delivers accurate semantic understanding, supports customizable output resolutions, and produces high-quality images across a wide range of styles, making it well-suited for concept design, creative prototyping, and professional content workflows.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 40,
"currency": "USD"
},
"output": {
"per_million_tokens": 40,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "doubao-embedding-text-240715",
"description": "doubao-embedding-text-240715\nDoubao Embedding is a semantic vectorization model developed by ByteDance, primarily designed for vector search scenarios. It supports both Chinese and English languages and has a maximum context length of approximately 4K tokens.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.7,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.7,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "qwen3-14b",
"description": "Achieves effective integration of thinking and non-thinking modes, enabling mode switching during conversations. Its reasoning ability reaches state-of-the-art (SOTA) levels among models of the same scale, and its general capability significantly surpasses Qwen2.5-14B.",
"pricing": {
"input": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "grok-3-beta",
"description": "Grok's latest model\nThis model ID with beta has been officially taken offline. Using this model grok-3-beta will automatically point to grok-3.",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 15,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "grok-3-fast",
"pricing": {
"input": {
"per_million_tokens": 5.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 27.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen3-8b",
"description": "Achieves effective integration of thinking and non-thinking modes, enabling mode switching during conversations. Its reasoning ability reaches state-of-the-art (SOTA) levels among models of the same scale, and its general capability significantly surpasses Qwen2.5-7B.",
"pricing": {
"input": {
"per_million_tokens": 0.08,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "qwen3-4b",
"description": "Achieves effective integration of thinking and non-thinking modes, allowing mode switching during conversations. Its reasoning ability reaches state-of-the-art (SOTA) levels among models of the same scale, with significantly enhanced human preference alignment. There are notable improvements in creative writing, role-playing, multi-turn dialogue, and instruction following, resulting in a noticeably better user experience.",
"pricing": {
"input": {
"per_million_tokens": 0.046,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.46,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "deepseek-ai/DeepSeek-R1-Zero",
"description": "Openly deployed by chutes.ai; inference with FP8; zero is the initial preliminary version of R1 without optimizations and is not recommended for use unless for research purposes.",
"pricing": {
"input": {
"per_million_tokens": 2.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "grok-3-fast-beta",
"pricing": {
"input": {
"per_million_tokens": 5.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 27.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "grok-3-mini",
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.501,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "grok-3-mini-beta",
"description": "This model ID with beta has been officially taken offline. Using this model grok-3-mini-beta will automatically point to grok-3-mini.",
"pricing": {
"input": {
"per_million_tokens": 0.33,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5511,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen3-1.7b",
"description": "Effectively integrates thinking and non-thinking modes, allowing mode switching during conversations. Its general capabilities significantly surpass those of the Qwen2.5 small-scale series, with greatly enhanced human preference alignment. There are notable improvements in creative writing, role-playing, multi-turn dialogue, and instruction following, resulting in a significantly better expected user experience.",
"pricing": {
"input": {
"per_million_tokens": 0.046,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.46,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "qwen3-0.6b",
"description": "Effectively integrates thinking and non-thinking modes, allowing mode switching during conversations. Its general capabilities significantly surpass those of the Qwen2.5 small-scale series.",
"pricing": {
"input": {
"per_million_tokens": 0.046,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.46,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "qwen-turbo-2025-04-28",
"description": "The Qwen3 series Turbo model effectively integrates thinking and non-thinking modes, allowing seamless switching between modes during conversations. With a smaller parameter size, its reasoning ability rivals that of QwQ-32B, and its general capabilities significantly surpass those of Qwen2.5-Turbo, reaching state-of-the-art (SOTA) levels among models of the same scale. This version is a snapshot model as of April 28, 2025.",
"pricing": {
"input": {
"per_million_tokens": 0.046,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.92,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "grok-3-mini-fast-beta",
"pricing": {
"input": {
"per_million_tokens": 0.33,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.20011,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-3-32b",
"description": "cerebras",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "command-a-03-2025",
"description": "Command A is Cohere most performant model to date, excelling at tool use, agents, retrieval augmented generation (RAG), and multilingual use cases. Command A has a context length of 256K, only requires two GPUs to run, and has 150% higher throughput compared to Command R+ 08-2024.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 2.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "qwen-plus-2025-04-28",
"description": "The Qwen3 series Plus model effectively integrates thinking and non-thinking modes, allowing for mode switching during conversations. Its reasoning abilities significantly surpass those of QwQ, and its general capabilities are markedly superior to Qwen2.5-Plus, reaching state-of-the-art (SOTA) levels among models of the same scale. This version is a snapshot model as of April 28, 2025.",
"pricing": {
"input": {
"per_million_tokens": 0.13,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "THUDM/GLM-Z1-32B-0414",
"description": "GLM-Z1-32B-0414 is a reasoning-focused AI model built on GLM-4-32B-0414. It has been enhanced through cold-start methods and reinforcement learning, with a strong emphasis on math, coding, and logic tasks. Despite having only 32B parameters, it performs comparably to the 671B DeepSeek-R1 on some benchmarks. It excels in complex reasoning tasks, as shown in evaluations like AIME 24/25, LiveCodeBench, and GPQA.",
"pricing": {
"input": {
"per_million_tokens": 0.08,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.08,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Pro/THUDM/GLM-4.1V-9B-Thinking",
"description": "GLM-4.1V-9B-Thinking is an open-source Vision Language Model (VLM) jointly released by Zhipu AI and the KEG Laboratory at Tsinghua University, designed specifically for handling complex multimodal cognitive tasks. Based on the GLM-4-9B-0414 foundation model, it significantly enhances cross-modal reasoning ability and stability by introducing the “Chain-of-Thought” reasoning mechanism and using reinforcement learning strategies. As a lightweight model with 9 billion parameters, it strikes a balance between deployment efficiency and performance. In 28 authoritative benchmark evaluations, it matched or even outperformed the 72-billion-parameter Qwen-2.5-VL-72B model in 18 tasks. The model excels not only in image-text understanding, mathematical and scientific reasoning, and video understanding, but also supports images up to 4K resolution and inputs of arbitrary aspect ratios.",
"pricing": {
"input": {
"per_million_tokens": 0.04,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "THUDM/GLM-4.1V-9B-Thinking",
"description": "GLM-4.1V-9B-Thinking is an open-source Vision Language Model (VLM) jointly released by Zhipu AI and the KEG Laboratory at Tsinghua University, designed specifically for handling complex multimodal cognitive tasks. Based on the GLM-4-9B-0414 foundation model, it significantly enhances cross-modal reasoning ability and stability by introducing the “Chain-of-Thought” reasoning mechanism and using reinforcement learning strategies. As a lightweight model with 9 billion parameters, it strikes a balance between deployment efficiency and performance. In 28 authoritative benchmark evaluations, it matched or even outperformed the 72-billion-parameter Qwen-2.5-VL-72B model in 18 tasks. The model excels not only in image-text understanding, mathematical and scientific reasoning, and video understanding, but also supports images up to 4K resolution and inputs of arbitrary aspect ratios.",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-embedding-004",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "THUDM/GLM-4-32B-0414",
"description": "GLM-4-32B-0414 is a next-generation open-source model with 32 billion parameters, delivering performance comparable to OpenAIs GPT series and DeepSeek V3/R1. It supports smooth local deployment.\n\nThe base model was pre-trained on 15T of high-quality data, including a large amount of reasoning-focused synthetic content, setting the stage for advanced reinforcement learning.\n\nIn the post-training phase, techniques like human preference alignment, rejection sampling, and reinforcement learning were used to improve the models ability to follow instructions, generate code, and handle function calls—core skills needed for agent-style tasks.\n\nGLM-4-32B-0414 has shown strong results in engineering code, artifact generation, function calling, search-based QA, and report writing—sometimes matching or even surpassing larger models like GPT-4o and DeepSeek-V3 (671B) on specific benchmarks.",
"pricing": {
"input": {
"per_million_tokens": 0.08,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.08,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "THUDM/GLM-Z1-9B-0414",
"description": "GLM-Z1-9B-0414 is a small but powerful model in the GLM series, with only 9 billion parameters. Despite its size, it delivers strong performance in math reasoning and general tasks, ranking among the best in its class of open-source models.\n\nTrained with the same techniques as larger models, it strikes an excellent balance between performance and efficiency—making it a great option for low-resource or lightweight deployment scenarios.",
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "THUDM/GLM-4-9B-0414",
"description": "GLM-4-9B-0414 is a lightweight model in the GLM family, with 9 billion parameters. It inherits the core tech from GLM-4-32B and offers an efficient option for deployment on limited resources.\n\nDespite its smaller size, it performs well in tasks like code generation, web design, SVG graphics creation, and search-based writing. It also supports function calling to interact with external tools, enhancing its versatility.\n\nGLM-4-9B-0414 strikes a solid balance between efficiency and performance, making it a strong choice for low-resource environments—while remaining competitive on various benchmarks.",
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "cc-doubao-seed-code-preview-latest",
"description": "claude code ",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "doubao-seed-code-preview-latest",
"description": "chat",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/Janus-Pro-7B",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-zero-preview",
"description": "Simply put, it is the intelligent enhanced version of O1.",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-3-235b-a22b-instruct-2507",
"description": "cerebras",
"pricing": {
"input": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-flash-thinking-exp-1219",
"description": "The Gemini 2.0 Flash Thinking mode is an experimental model designed to generate the \"thinking process\" that the model undergoes during its response. Therefore, the Gemini 2.0 Flash Thinking mode possesses stronger reasoning capabilities in its responses compared to the base Gemini 2.0 Flash model.",
"pricing": {
"input": {
"per_million_tokens": 0.076,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.304,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
"description": "Llama-3.1-Nemotron-Ultra-253B is a 253 billion parameter reasoning-focused language model optimized for efficiency that excels at math, coding, and general instruction-following tasks while running on a single 8xH100 node.",
"pricing": {
"input": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-4.5-air",
"input_modalities": [
"TEXT"
],
"context_window": 131072,
"max_output_tokens": 98304,
"pricing": {
"input": {
"per_million_tokens": 0.14,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.84,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-32k",
"description": "The smartest version of GPT-4; OpenAI no longer offers it officially. All the 32k versions on this site are provided by Microsoft, deployed on Azure OpenAI by the official Microsoft service.",
"pricing": {
"input": {
"per_million_tokens": 60,
"currency": "USD"
},
"output": {
"per_million_tokens": 120,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "o1-preview-2024-09-12",
"pricing": {
"input": {
"per_million_tokens": 15,
"currency": "USD"
},
"output": {
"per_million_tokens": 60,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 7.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "coding-glm-4.5-air",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.014,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.084,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/QVQ-72B-Preview",
"pricing": {
"input": {
"per_million_tokens": 1.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/QwQ-32B-Preview",
"pricing": {
"input": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.16,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.1-sonar-huge-128k-online",
"description": "On February 22, 2025, this model will be officially discontinued. The Perplexity AI official fine-tuned LLMA internet-connected interface is currently only supported at the api.aihubmix.com address.",
"pricing": {
"input": {
"per_million_tokens": 5.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 5.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.1-sonar-large-128k-online",
"description": "On February 22, 2025, this model will be officially discontinued; Perplexity AI's official fine-tuned LLMA internet-connected interface is currently only supported at the api.aihubmix.com address.",
"pricing": {
"input": {
"per_million_tokens": 1.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Mistral-Large-2411",
"description": "The latest Mistral Large 2 model is deployed on Azure.",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Mistral-large-2407",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 9,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "grok-2-1212",
"pricing": {
"input": {
"per_million_tokens": 1.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 9,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-4.5-flash",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0,
"currency": "USD"
},
"output": {
"per_million_tokens": 0,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-image-test",
"pricing": {
"input": {
"per_million_tokens": 5,
"currency": "USD"
},
"output": {
"per_million_tokens": 40,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "imagen-3.0-generate-002",
"description": "Imagen 3.0 is Google's latest text-to-image generation model, capable of creating high-quality images from natural language prompts. Compared to its predecessors, Imagen 3.0 offers significant improvements in detail, lighting, and reduced visual artifacts. It supports rendering in various artistic styles, from photorealism to impressionism, as well as abstract and anime styles.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "llama3.1-8b",
"description": "cerebras",
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "o1-2024-12-17",
"capabilities": [
"REASONING"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 15,
"currency": "USD"
},
"output": {
"per_million_tokens": 60,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 7.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking"
],
"original_types": "llm",
"original_features": "thinking"
}
},
{
"id": "sf-kimi-k2-thinking",
"pricing": {
"input": {
"per_million_tokens": 0.548,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.192,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "DESCRIBE",
"description": "This endpoint is used to describe an image.\nSupported image formats include JPEG, PNG, and WebP.\nUS $0.01/ IMG.\nFor usage examples and pricing details, refer to the documentation at https://docs.aihubmix.com/cn/api/IdeogramAI.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "UPSCALE",
"description": "The super-resolution upscale interface of the Ideogram AI drawing model is designed to enlarge low-resolution images into high-resolution ones, redrawing details (with controllable similarity and detail proportions).\nUS $0.06/ IMG.\nFor usage examples and pricing details, refer to the documentation at https://docs.aihubmix.com/cn/api/IdeogramAI.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "bai-qwen3-vl-235b-a22b-instruct",
"description": "The Qwen3 series open-source models include hybrid models, thinking models, and non-thinking models, with both reasoning capabilities and general abilities reaching industry SOTA levels at the same scale.",
"pricing": {
"input": {
"per_million_tokens": 0.274,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.096,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "cc-MiniMax-M2",
"description": "For Claude Code only",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "cc-deepseek-v3",
"description": "For Claude code only",
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "cc-deepseek-v3.1",
"description": "For Claude code only",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.56,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.68,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "cc-ernie-4.5-300b-a47b",
"description": "For Claude code only",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.32,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.28,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "cc-kimi-dev-72b",
"description": "For Claude code only",
"pricing": {
"input": {
"per_million_tokens": 0.32,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.28,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "cc-kimi-k2-instruct",
"description": "For Claude code only",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 1.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "cc-kimi-k2-instruct-0905",
"description": "For Claude code only",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 1.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "cc-kimi-k2-thinking",
"description": "Dedicated for Claude Code",
"pricing": {
"input": {
"per_million_tokens": 0.548,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.192,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "computer-use-preview",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 12,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-flash-thinking-exp",
"pricing": {
"input": {
"per_million_tokens": 0.076,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.304,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Baichuan3-Turbo",
"pricing": {
"input": {
"per_million_tokens": 1.9,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.9,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Baichuan3-Turbo-128k",
"pricing": {
"input": {
"per_million_tokens": 3.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Baichuan4",
"pricing": {
"input": {
"per_million_tokens": 16,
"currency": "USD"
},
"output": {
"per_million_tokens": 16,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Baichuan4-Air",
"pricing": {
"input": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.16,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Baichuan4-Turbo",
"pricing": {
"input": {
"per_million_tokens": 2.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "DeepSeek-v3",
"pricing": {
"input": {
"per_million_tokens": 0.272,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.088,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-1.5-lite-32k",
"description": "Doubao-1.5-lite, a brand-new generation of lightweight model, offers exceptional response speed with both performance and latency reaching world-class levels. It supports a 32k context window and an output length of up to 12k tokens.",
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.01,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-1.5-pro-256k",
"description": "Doubao-1.5-pro-256k, a fully upgraded version based on Doubao-1.5-Pro, delivers an overall performance improvement of 10%. It supports inference with a 256k context window and an output length of up to 12k tokens. With higher performance, larger window size, and exceptional cost-effectiveness, it is suitable for a wider range of application scenarios.",
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.44,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-1.5-pro-32k",
"description": "Doubao-1.5-pro, a brand-new generation of flagship model, features comprehensive performance upgrades and excels in knowledge, coding, reasoning, and other aspects. It supports a 32k context window and an output length of up to 12k tokens.",
"pricing": {
"input": {
"per_million_tokens": 0.134,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.335,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0268,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-1.5-vision-pro-32k",
"description": "Doubao-1.5-vision-pro is a newly upgraded multimodal large model that supports image recognition at any resolution and extreme aspect ratios. It enhances visual reasoning, document recognition, detailed information understanding, and instruction-following capabilities. It supports a 32k context window and an output length of up to 12k tokens.",
"pricing": {
"input": {
"per_million_tokens": 0.46,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.38,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-lite-128k",
"pricing": {
"input": {
"per_million_tokens": 0.14,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.14,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-lite-32k",
"pricing": {
"input": {
"per_million_tokens": 0.06,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.12,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.012,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-lite-4k",
"pricing": {
"input": {
"per_million_tokens": 0.06,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.12,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.06,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-pro-128k",
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.44,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-pro-256k",
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.44,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-pro-32k",
"pricing": {
"input": {
"per_million_tokens": 0.14,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.35,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.028,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Doubao-pro-4k",
"pricing": {
"input": {
"per_million_tokens": 0.14,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.35,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "GPT-OSS-20B",
"pricing": {
"input": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.55,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Gryphe/MythoMax-L2-13b",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "MiniMax-Text-01",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.14,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.12,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"long_context"
],
"original_features": "long_context"
}
},
{
"id": "Mistral-large-2407",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 9,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2-1.5B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2-57B-A14B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.24,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.24,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2-72B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2-7B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.08,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.08,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2.5-32B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2.5-72B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2.5-72B-Instruct-128K",
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2.5-7B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen/Qwen2.5-Coder-32B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.16,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Qwen3-235B-A22B-Thinking-2507",
"pricing": {
"input": {
"per_million_tokens": 0.28,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "Stable-Diffusion-3-5-Large",
"description": "Stable Diffusion 3.5 Large, developed by Stability AI, is a text-to-image generation model that supports high-quality image creation with excellent prompt responsiveness and customization, suitable for professional applications.",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 4,
"currency": "USD"
},
"output": {
"per_million_tokens": 4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "WizardLM/WizardCoder-Python-34B-V1.0",
"pricing": {
"input": {
"per_million_tokens": 0.9,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.9,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "ahm-Phi-3-5-MoE-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "ahm-Phi-3-5-mini-instruct",
"description": "Phi-3.5-mini is a lightweight, state-of-the-art open model built upon the dataset used for Phi-3—which includes synthetic data and carefully curated publicly available websites—focusing on very high-quality, reasoning-intensive data. This model is part of the Phi-3 model family and supports a context length of 128K tokens.",
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "ahm-Phi-3-5-vision-instruct",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "ahm-Phi-3-medium-128k",
"pricing": {
"input": {
"per_million_tokens": 6,
"currency": "USD"
},
"output": {
"per_million_tokens": 18,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "ahm-Phi-3-medium-4k",
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "ahm-Phi-3-small-128k",
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Codestral-2501",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Cohere-command-r",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.64,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.92,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "aihubmix-Jamba-1-5-Large",
"pricing": {
"input": {
"per_million_tokens": 2.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 8.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Llama-3-1-405B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 5,
"currency": "USD"
},
"output": {
"per_million_tokens": 15,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Llama-3-1-70B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.78,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Llama-3-1-8B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Llama-3-2-11B-Vision",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Llama-3-2-90B-Vision",
"pricing": {
"input": {
"per_million_tokens": 2.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Llama-3-70B-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.7,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.7,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-Mistral-large",
"pricing": {
"input": {
"per_million_tokens": 4,
"currency": "USD"
},
"output": {
"per_million_tokens": 12,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aihubmix-command-r-08-2024",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "aihubmix-command-r-plus",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 3.84,
"currency": "USD"
},
"output": {
"per_million_tokens": 19.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "aihubmix-command-r-plus-08-2024",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 2.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 11.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "baidu-deepseek-v3.2",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.274,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.411,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0274,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "baidu-deepseek-v3.2-exp",
"capabilities": [
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.274,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.411,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.0274,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"tools",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "tools,function_calling,structured_outputs"
}
},
{
"id": "chatglm_lite",
"pricing": {
"input": {
"per_million_tokens": 0.2858,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2858,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "chatglm_pro",
"pricing": {
"input": {
"per_million_tokens": 1.4286,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.4286,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "chatglm_std",
"pricing": {
"input": {
"per_million_tokens": 0.7144,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.7144,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "chatglm_turbo",
"pricing": {
"input": {
"per_million_tokens": 0.7144,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.7144,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "claude-2",
"pricing": {
"input": {
"per_million_tokens": 8.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 8.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "claude-2.0",
"pricing": {
"input": {
"per_million_tokens": 8.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 39.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "claude-2.1",
"pricing": {
"input": {
"per_million_tokens": 8.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 39.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "claude-3-5-sonnet-20240620",
"description": "Claude 3.5 Sonnet delivers performance superior to Opus and speeds faster than its predecessor, all at the same price point. Its core strengths include:\n\nCoding: Autonomously writes, edits, and executes code with advanced reasoning and troubleshooting.\nData Science: Augments human expertise by analyzing unstructured data and using multiple tools to generate insights.\nVisual Processing: Excels at interpreting charts, graphs, and images, accurately transcribing text to derive high-level insights.\nAgentic Tasks: Exceptional tool use makes it highly effective for complex, multi-step agentic workflows that interact with other systems.",
"input_modalities": [
"TEXT",
"VISION"
],
"context_window": 200000,
"max_output_tokens": 8192,
"pricing": {
"input": {
"per_million_tokens": 3.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 16.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "claude-3-5-sonnet@20240620",
"pricing": {
"input": {
"per_million_tokens": 3.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 16.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "claude-3-haiku-20240229",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.275,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.275,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "claude-3-haiku-20240307",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.275,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.375,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "claude-3-haiku@20240307",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.275,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.375,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "claude-3-opus@20240229",
"pricing": {
"input": {
"per_million_tokens": 16.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 82.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "claude-3-sonnet-20240229",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 3.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 16.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "claude-instant-1",
"pricing": {
"input": {
"per_million_tokens": 1.793,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.793,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "claude-instant-1.2",
"pricing": {
"input": {
"per_million_tokens": 0.88,
"currency": "USD"
},
"output": {
"per_million_tokens": 3.96,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "code-davinci-edit-001",
"pricing": {
"input": {
"per_million_tokens": 20,
"currency": "USD"
},
"output": {
"per_million_tokens": 20,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "cogview-3",
"pricing": {
"input": {
"per_million_tokens": 35.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 35.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "cogview-3-plus",
"pricing": {
"input": {
"per_million_tokens": 10,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "command",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "command-light",
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "command-light-nightly",
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "command-nightly",
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "command-r",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.64,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.92,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "command-r-08-2024",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "command-r-plus",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 3.84,
"currency": "USD"
},
"output": {
"per_million_tokens": 19.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "command-r-plus-08-2024",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 2.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 11.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "dall-e-2",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 16,
"currency": "USD"
},
"output": {
"per_million_tokens": 16,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "davinci",
"pricing": {
"input": {
"per_million_tokens": 20,
"currency": "USD"
},
"output": {
"per_million_tokens": 20,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "davinci-002",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/DeepSeek-Coder-V2-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.32,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"pricing": {
"input": {
"per_million_tokens": 0.01,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.01,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"pricing": {
"input": {
"per_million_tokens": 0.01,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.01,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
"description": "Open source deployment from SiliconFlow, the model itself is obtained through knowledge distillation.",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
"description": "Open source deployment from SiliconFlow, the model itself is obtained through knowledge distillation.",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
"description": "Open source deployment from SiliconFlow, the model itself is obtained through knowledge distillation.",
"pricing": {
"input": {
"per_million_tokens": 0.01,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.01,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/DeepSeek-V2-Chat",
"pricing": {
"input": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.32,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/DeepSeek-V2.5",
"pricing": {
"input": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.32,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/deepseek-llm-67b-chat",
"pricing": {
"input": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.16,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-ai/deepseek-vl2",
"pricing": {
"input": {
"per_million_tokens": 0.16,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.16,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-v3",
"pricing": {
"input": {
"per_million_tokens": 0.272,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.088,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "distil-whisper-large-v3-en",
"input_modalities": [
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 5.556,
"currency": "USD"
},
"output": {
"per_million_tokens": 5.556,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"stt"
],
"original_types": "stt"
}
},
{
"id": "doubao-1-5-thinking-vision-pro-250428",
"description": "Deep Thinking \nImage Understanding \nVisual Localization \nVideo Understanding \nTool Invocation \nStructured Output",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-flash-001",
"description": "Google Gemini's enterprise version VertexAI",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-flash-exp-image-generation",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.0-flash-lite",
"description": "Gemini-2.0-flash Lightweight Official Version",
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 0.076,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.304,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.076,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"long_context"
],
"original_types": "llm",
"original_features": "long_context"
}
},
{
"id": "gemini-2.0-flash-lite-001",
"description": "Google Gemini's enterprise version VertexAI",
"pricing": {
"input": {
"per_million_tokens": 0.076,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.304,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.076,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-2.5-pro-exp-03-25",
"description": "Googles latest experimental model, highly unstable, for experience only.\nIt boasts strong reasoning and coding capabilities, able to \"think\" before responding, enhancing performance and accuracy in complex tasks. It supports multimodal inputs (text, audio, images, video) and a 1 million token context window, suitable for advanced programming, math, and science tasks.\n\nThis means Gemini 2.5 can handle more complex problems in coding, science and math, and support more context-aware agents.",
"capabilities": [
"STRUCTURED_OUTPUT",
"FUNCTION_CALL"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 5,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.31,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"structured_outputs",
"tools",
"long_context"
],
"original_types": "llm",
"original_features": "structured_outputs,tools,long_context"
}
},
{
"id": "gemini-embedding-exp-03-07",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "gemini-exp-1114",
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-exp-1121",
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-pro",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemini-pro-vision",
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemma-7b-it",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gemma2-9b-it",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-3-turbo",
"pricing": {
"input": {
"per_million_tokens": 0.71,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.71,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-4",
"pricing": {
"input": {
"per_million_tokens": 14.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 14.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-4-flash",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-4-plus",
"pricing": {
"input": {
"per_million_tokens": 8,
"currency": "USD"
},
"output": {
"per_million_tokens": 8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-4.5-airx",
"description": "GLM-4.5-AirX is the high-speed version of GLM-4.5-Air, with faster response times, specifically designed for large-scale high-speed demands.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 1.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 4.51,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.22,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-4v",
"pricing": {
"input": {
"per_million_tokens": 14.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 14.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "glm-4v-plus",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "google/gemini-exp-1114",
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "google/gemma-2-27b-it",
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "google/gemma-2-9b-it:free",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-3.5-turbo",
"pricing": {
"input": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-3.5-turbo-0125",
"pricing": {
"input": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-3.5-turbo-0301",
"pricing": {
"input": {
"per_million_tokens": 1.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-3.5-turbo-0613",
"pricing": {
"input": {
"per_million_tokens": 1.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-3.5-turbo-1106",
"pricing": {
"input": {
"per_million_tokens": 1,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-3.5-turbo-16k",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-3.5-turbo-16k-0613",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-3.5-turbo-instruct",
"pricing": {
"input": {
"per_million_tokens": 1.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4",
"pricing": {
"input": {
"per_million_tokens": 30,
"currency": "USD"
},
"output": {
"per_million_tokens": 60,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-0125-preview",
"pricing": {
"input": {
"per_million_tokens": 10,
"currency": "USD"
},
"output": {
"per_million_tokens": 30,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-0314",
"pricing": {
"input": {
"per_million_tokens": 30,
"currency": "USD"
},
"output": {
"per_million_tokens": 60,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-0613",
"pricing": {
"input": {
"per_million_tokens": 30,
"currency": "USD"
},
"output": {
"per_million_tokens": 60,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-1106-preview",
"pricing": {
"input": {
"per_million_tokens": 10,
"currency": "USD"
},
"output": {
"per_million_tokens": 30,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-32k-0314",
"pricing": {
"input": {
"per_million_tokens": 60,
"currency": "USD"
},
"output": {
"per_million_tokens": 120,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-32k-0613",
"pricing": {
"input": {
"per_million_tokens": 60,
"currency": "USD"
},
"output": {
"per_million_tokens": 120,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-turbo",
"pricing": {
"input": {
"per_million_tokens": 10,
"currency": "USD"
},
"output": {
"per_million_tokens": 30,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-turbo-2024-04-09",
"pricing": {
"input": {
"per_million_tokens": 10,
"currency": "USD"
},
"output": {
"per_million_tokens": 30,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-turbo-preview",
"pricing": {
"input": {
"per_million_tokens": 10,
"currency": "USD"
},
"output": {
"per_million_tokens": 30,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4-vision-preview",
"pricing": {
"input": {
"per_million_tokens": 10,
"currency": "USD"
},
"output": {
"per_million_tokens": 30,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4o-2024-05-13",
"context_window": 128000,
"max_output_tokens": 4096,
"pricing": {
"input": {
"per_million_tokens": 5,
"currency": "USD"
},
"output": {
"per_million_tokens": 15,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4o-mini-2024-07-18",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 0.15,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "gpt-oss-20b",
"description": "gpt-oss-20b is a 21-billion parameter open-weight model released by OpenAI under the Apache 2.0 license. Its core feature is a Mixture-of-Experts (MoE) architecture that uses only 3.6B active parameters, enabling low-latency inference and deployment on consumer GPUs. The model also supports fine-tuning, function calling, tool use, and structured outputs.",
"capabilities": [
"REASONING",
"FUNCTION_CALL",
"STRUCTURED_OUTPUT"
],
"input_modalities": [
"TEXT"
],
"context_window": 128000,
"max_output_tokens": 128000,
"pricing": {
"input": {
"per_million_tokens": 0.11,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.55,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"thinking",
"function_calling",
"structured_outputs"
],
"original_types": "llm",
"original_features": "thinking,function_calling,structured_outputs"
}
},
{
"id": "grok-2-vision-1212",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 1.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 9,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "grok-vision-beta",
"input_modalities": [
"TEXT",
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 5.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 16.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "imagen-4.0-generate-preview-05-20",
"description": "Google's latest raw image model",
"capabilities": [
"IMAGE_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION"
],
"output_modalities": [
"VISION"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"image_generation"
],
"category": "image-generation",
"original_types": "image_generation"
}
},
{
"id": "jina-embeddings-v2-base-code",
"description": "Model optimized for code and document search, 768-dimensional, 137M parameters.",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.05,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.05,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "learnlm-1.5-pro-experimental",
"pricing": {
"input": {
"per_million_tokens": 1.25,
"currency": "USD"
},
"output": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.1-405b-instruct",
"pricing": {
"input": {
"per_million_tokens": 4,
"currency": "USD"
},
"output": {
"per_million_tokens": 4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.1-405b-reasoning",
"pricing": {
"input": {
"per_million_tokens": 4,
"currency": "USD"
},
"output": {
"per_million_tokens": 4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.1-70b",
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.1-70b-versatile",
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.1-8b-instant",
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.1-sonar-small-128k-online",
"description": "On February 22, 2025, this model will be officially discontinued. The Perplexity AI official fine-tuned LLMA online interface is currently supported only at the api.aihubmix.com address.",
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.2-11b-vision-preview",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.2-1b-preview",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.2-3b-preview",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama-3.2-90b-vision-preview",
"pricing": {
"input": {
"per_million_tokens": 2.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama2-70b-4096",
"pricing": {
"input": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm"
],
"original_types": "llm"
}
},
{
"id": "llama2-7b-2048",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama3-70b-8192",
"pricing": {
"input": {
"per_million_tokens": 0.7,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.937288,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama3-70b-8192(33)",
"pricing": {
"input": {
"per_million_tokens": 2.65,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.65,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama3-8b-8192",
"pricing": {
"input": {
"per_million_tokens": 0.06,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.12,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama3-8b-8192(33)",
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama3-groq-70b-8192-tool-use-preview",
"pricing": {
"input": {
"per_million_tokens": 0.00089,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.00089,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "llama3-groq-8b-8192-tool-use-preview",
"pricing": {
"input": {
"per_million_tokens": 0.00019,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.00019,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta-llama/Llama-3.2-90B-Vision-Instruct",
"pricing": {
"input": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta-llama/llama-3.1-405b-instruct:free",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta-llama/llama-3.1-70b-instruct:free",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta-llama/llama-3.1-8b-instruct:free",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta-llama/llama-3.2-11b-vision-instruct:free",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta-llama/llama-3.2-3b-instruct:free",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta/llama-3.1-405b-instruct",
"pricing": {
"input": {
"per_million_tokens": 5,
"currency": "USD"
},
"output": {
"per_million_tokens": 5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta/llama3-8B-chat",
"pricing": {
"input": {
"per_million_tokens": 0.3,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "mistralai/mistral-7b-instruct:free",
"pricing": {
"input": {
"per_million_tokens": 0.002,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.002,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "moonshot-v1-128k",
"pricing": {
"input": {
"per_million_tokens": 10,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "moonshot-v1-128k-vision-preview",
"pricing": {
"input": {
"per_million_tokens": 10,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "moonshot-v1-32k",
"pricing": {
"input": {
"per_million_tokens": 4,
"currency": "USD"
},
"output": {
"per_million_tokens": 4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "moonshot-v1-32k-vision-preview",
"pricing": {
"input": {
"per_million_tokens": 4,
"currency": "USD"
},
"output": {
"per_million_tokens": 4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "moonshot-v1-8k",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "moonshot-v1-8k-vision-preview",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "nvidia/llama-3.1-nemotron-70b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "o1-mini-2024-09-12",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 12,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 1.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "omni-moderation-latest",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-flash",
"description": "The model adopts tiered pricing.",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-flash-2025-07-28",
"description": "The model adopts tiered pricing.",
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-long",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-max",
"pricing": {
"input": {
"per_million_tokens": 0.38,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.52,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-max-longcontext",
"pricing": {
"input": {
"per_million_tokens": 7,
"currency": "USD"
},
"output": {
"per_million_tokens": 21,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-plus",
"pricing": {
"input": {
"per_million_tokens": 0.7,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen-turbo",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.36,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.08,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"long_context"
],
"original_types": "llm",
"original_features": "long_context"
}
},
{
"id": "qwen-turbo-2024-11-01",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.36,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.08,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"llm",
"long_context"
],
"original_types": "llm",
"original_features": "long_context"
}
},
{
"id": "qwen2.5-14b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-32b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-3b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-72b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-7b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-coder-1.5b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-coder-7b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-math-1.5b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-math-72b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 2.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qwen2.5-math-7b-instruct",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "sonar-reasoning-pro",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 12,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "step-2-16k",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-ada-001",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-babbage-001",
"pricing": {
"input": {
"per_million_tokens": 0.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-curie-001",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-davinci-002",
"pricing": {
"input": {
"per_million_tokens": 20,
"currency": "USD"
},
"output": {
"per_million_tokens": 20,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-davinci-003",
"pricing": {
"input": {
"per_million_tokens": 20,
"currency": "USD"
},
"output": {
"per_million_tokens": 20,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-davinci-edit-001",
"pricing": {
"input": {
"per_million_tokens": 20,
"currency": "USD"
},
"output": {
"per_million_tokens": 20,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-embedding-3-large",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.13,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.13,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "text-embedding-3-small",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.02,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.02,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "text-embedding-ada-002",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "text-embedding-v1",
"input_modalities": [
"TEXT"
],
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"embedding"
],
"original_types": "embedding"
}
},
{
"id": "text-moderation-007",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-moderation-latest",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-moderation-stable",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "text-search-ada-doc-001",
"pricing": {
"input": {
"per_million_tokens": 20,
"currency": "USD"
},
"output": {
"per_million_tokens": 20,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "tts-1",
"input_modalities": [
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 15,
"currency": "USD"
},
"output": {
"per_million_tokens": 15,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"tts"
],
"original_types": "tts"
}
},
{
"id": "tts-1-1106",
"input_modalities": [
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 15,
"currency": "USD"
},
"output": {
"per_million_tokens": 15,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"tts"
],
"original_types": "tts"
}
},
{
"id": "tts-1-hd",
"input_modalities": [
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 30,
"currency": "USD"
},
"output": {
"per_million_tokens": 30,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"tts"
],
"original_types": "tts"
}
},
{
"id": "tts-1-hd-1106",
"input_modalities": [
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 30,
"currency": "USD"
},
"output": {
"per_million_tokens": 30,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"tts"
],
"original_types": "tts"
}
},
{
"id": "veo-3",
"description": "veo3 reverse access with a total cost of just $0.41 per video generation., OpenAI chat port compatible format.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "veo3",
"description": "veo3 reverse access with a total cost of just $0.41 per video generation., OpenAI chat port compatible format.\nNote that this is a reverse interface, and charges are based on the number of requests. As long as a request is initiated, even if it returns a failure, you will be charged. If you cannot accept this, please do not use it.",
"capabilities": [
"VIDEO_GENERATION"
],
"input_modalities": [
"TEXT",
"VISION",
"AUDIO",
"VIDEO"
],
"output_modalities": [
"VIDEO"
],
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 2,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"video"
],
"category": "video-generation",
"original_types": "video"
}
},
{
"id": "whisper-1",
"description": "Ignore the displayed price on the page; the actual charge for this model request is consistent with the official, so you can use it with confidence.",
"input_modalities": [
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 100,
"currency": "USD"
},
"output": {
"per_million_tokens": 100,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"stt"
],
"original_types": "stt"
}
},
{
"id": "whisper-large-v3",
"input_modalities": [
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 30.834,
"currency": "USD"
},
"output": {
"per_million_tokens": 30.834,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"stt"
],
"original_types": "stt"
}
},
{
"id": "whisper-large-v3-turbo",
"input_modalities": [
"AUDIO"
],
"pricing": {
"input": {
"per_million_tokens": 5.556,
"currency": "USD"
},
"output": {
"per_million_tokens": 5.556,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix",
"tags": [
"stt"
],
"original_types": "stt"
}
},
{
"id": "yi-large",
"pricing": {
"input": {
"per_million_tokens": 3,
"currency": "USD"
},
"output": {
"per_million_tokens": 3,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "yi-large-rag",
"pricing": {
"input": {
"per_million_tokens": 4,
"currency": "USD"
},
"output": {
"per_million_tokens": 4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "yi-large-turbo",
"pricing": {
"input": {
"per_million_tokens": 1.8,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.8,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "yi-lightning",
"pricing": {
"input": {
"per_million_tokens": 0.2,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.2,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "yi-medium",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "yi-vl-plus",
"pricing": {
"input": {
"per_million_tokens": 0.000852,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.000852,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4o-2024-08-06-global",
"pricing": {
"input": {
"per_million_tokens": 2.5,
"currency": "USD"
},
"output": {
"per_million_tokens": 10,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 1.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "gpt-4o-mini-global",
"pricing": {
"input": {
"per_million_tokens": 0.15,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.075,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta-llama-3-70b",
"pricing": {
"input": {
"per_million_tokens": 4.795,
"currency": "USD"
},
"output": {
"per_million_tokens": 4.795,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "meta-llama-3-8b",
"pricing": {
"input": {
"per_million_tokens": 0.548,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.548,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "o3-global",
"pricing": {
"input": {
"per_million_tokens": 2,
"currency": "USD"
},
"output": {
"per_million_tokens": 8,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.5,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "o3-mini-global",
"pricing": {
"input": {
"per_million_tokens": 1.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 4.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.55,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "o3-pro-global",
"pricing": {
"input": {
"per_million_tokens": 20,
"currency": "USD"
},
"output": {
"per_million_tokens": 80,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qianfan-chinese-llama-2-13b",
"pricing": {
"input": {
"per_million_tokens": 0.822,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.822,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "qianfan-llama-vl-8b",
"pricing": {
"input": {
"per_million_tokens": 0.274,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.685,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aistudio_gemini-2.0-flash",
"pricing": {
"input": {
"per_million_tokens": 0.1,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.25,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "aistudio_gpt-4.1-mini",
"pricing": {
"input": {
"per_million_tokens": 0.4,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.6,
"currency": "USD"
},
"cache_read": {
"per_million_tokens": 0.1,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "deepseek-r1-distill-qianfan-llama-8b",
"pricing": {
"input": {
"per_million_tokens": 0.137,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.548,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "doubao-1-5-pro-256k-250115",
"pricing": {
"input": {
"per_million_tokens": 0.684,
"currency": "USD"
},
"output": {
"per_million_tokens": 1.2312,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
},
{
"id": "doubao-1-5-pro-32k-250115",
"pricing": {
"input": {
"per_million_tokens": 0.108,
"currency": "USD"
},
"output": {
"per_million_tokens": 0.27,
"currency": "USD"
}
},
"metadata": {
"source": "aihubmix"
}
}
]
}