[build-system] requires = ["setuptools>=61.0", "wheel"] build-backend = "setuptools.build_meta" [project] name = "rapid-mlx" version = "0.6.49" description = "Rapid-MLX — AI inference for Apple Silicon. Drop-in OpenAI API, 2-4x faster than Ollama." readme = "README.md" license = {text = "Apache-2.0"} requires-python = ">=3.10" authors = [ {name = "vllm-mlx contributors"} ] keywords = ["llm", "mlx", "apple-silicon", "vllm", "inference", "transformers"] classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Operating System :: MacOS", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ # Core — these are all you need for `rapid-mlx serve ` "mlx>=0.29.0", "mlx-lm>=0.31.0", # 0.31+ required for ArraysCache native batching (hybrid models) # mlx-vlm is opt-in via [vision] extras (saves ~322 MB for text-only users) "transformers>=5.0.0", # mlx-lm 0.30.5+ requires transformers 5.0 (rc3 bug fixed in stable) "tokenizers>=0.19.0", "huggingface-hub>=0.23.0", "numpy>=1.24.0", "pillow>=10.0.0", "tqdm>=4.66.0", "pyyaml>=6.0", "requests>=2.28.0", "tabulate>=0.9.0", "psutil>=5.9.0", "fastapi>=0.100.0", "uvicorn>=0.23.0", "mcp>=1.0.0", "jsonschema>=4.0.0", ] [project.optional-dependencies] # Vision/multimodal models (Gemma 4, Qwen-VL, etc.) — adds ~322 MB # Required for any model with vision input. Text-only models work without this. # 0.5.0+ also unlocks DFlash speculative decoding (see [dflash] extras). vision = [ "mlx-vlm>=0.5.0", # 0.5.0: gemma4 multi-image + tool-parser fixes, TurboQuant race-condition fix, continuous-batching guard. Also unlocks DFlash spec-decode hooks (see [dflash] extras). "opencv-python>=4.8.0", "torch>=2.3.0", "torchvision>=0.18.0", ] # DFlash speculative decoding for Qwen3.5/3.6 dense 8-bit models # (issue #264). Adds ~1-4 GB at runtime (drafter weights) and depends on # mlx-vlm's spec-decode runtime. Text-only — does NOT pull torch/cv2 like # the vision extras do. Only enable if you serve a DFlash-eligible alias # (e.g. qwen3.5-27b-8bit) with --enable-dflash. dflash = [ "mlx-vlm>=0.5.0", ] # Embedding endpoint embeddings = [ "mlx-embeddings>=0.0.5", ] # Gradio chat UI chat = [ "gradio>=4.0.0", "pytz>=2024.1", ] # All extras — union of vision + chat + embeddings. # Expanded directly to avoid a self-dependency that breaks # pip install .[all] and editable installs. all = [ # vision "mlx-vlm>=0.5.0", "opencv-python>=4.8.0", "torch>=2.3.0", "torchvision>=0.18.0", # chat "gradio>=4.0.0", "pytz>=2024.1", # embeddings "mlx-embeddings>=0.0.5", # dflash (text-only path covered by mlx-vlm above; listed for clarity) ] dev = [ "pytest>=7.0.0", "pytest-asyncio>=0.21.0", "black>=23.0.0", "ruff>=0.1.0", "mypy>=1.0.0", ] vllm = [ "vllm>=0.4.0", ] # Guided decoding with outlines for structured JSON output guided = [ "outlines[mlxlm]>=1.0.0", ] # Audio dependencies for TTS/STT (mlx-audio) audio = [ "mlx-audio>=0.2.9", "sounddevice>=0.4.0", "soundfile>=0.12.0", "scipy>=1.10.0", "numba>=0.57.0", "tiktoken>=0.5.0", "misaki[zh,ja]>=0.5.0", # Chinese (zh) and Japanese (ja) support "spacy>=3.7.0", "num2words>=0.5.0", "loguru>=0.7.0", "phonemizer>=3.2.0", # Additional multilingual dependencies "ordered_set>=4.1.0", # Required for Chinese TTS "cn2an>=0.5.0", # Chinese number conversion "fugashi>=1.3.0", # Japanese tokenizer "unidic-lite>=1.0.0", # Japanese dictionary for fugashi "jieba>=0.42.0", # Chinese word segmentation ] [project.urls] Homepage = "https://github.com/raullenchai/Rapid-MLX" Documentation = "https://github.com/raullenchai/Rapid-MLX#readme" Repository = "https://github.com/raullenchai/Rapid-MLX" [project.entry-points."vllm.platform_plugins"] mlx = "vllm_mlx.plugin:mlx_platform_plugin" [project.scripts] rapid-mlx = "vllm_mlx.cli:main" vllm-mlx = "vllm_mlx.cli:main" vllm-mlx-chat = "vllm_mlx.gradio_app:main" vllm-mlx-bench = "vllm_mlx.benchmark:main" [tool.setuptools.package-data] vllm_mlx = [ "aliases.json", "agents/profiles/*.yaml", # Integration tests bundled inside the package so `rapid-mlx agents # --test` works on pip/brew installs (the source layout keeps # them at tests/integrations/, mirrored here via symlinks). Without # this, the test runner shows a confusing path-error skip. "_integration_tests/*.py", "_integration_tests/*.sh", ] [tool.setuptools.packages.find] where = ["."] include = ["vllm_mlx*"] [tool.black] line-length = 88 target-version = ["py310", "py311", "py312", "py313"] [tool.ruff] line-length = 88 [tool.ruff.format] # Vendored upstream files — keep formatting identical to the source PR # so future syncs produce a clean diff. exclude = ["vllm_mlx/models/deepseek_v4.py"] [tool.ruff.lint] select = ["E", "F", "W", "I", "N", "UP", "B", "SIM"] ignore = [ "E402", # module-level import not at top "E501", # line too long "E731", # lambda assignment "F811", # redefined unused "F841", # unused variable "B004", # unreliable callable check "B007", # unused loop variable "B008", # function call in default arg "B011", # assert false "B017", # assert raises exception "B905", # zip without strict "N801", # class name casing "N806", # variable in function casing "SIM103", # needless bool "SIM108", # ternary instead of if-else "SIM110", # reimplemented builtin "SIM222", # expr or true "UP028", # yield in for loop "SIM102", # collapsible if "SIM105", # suppressible exception "SIM115", # open file with context handler "B904", # raise without from inside except ] [tool.ruff.lint.per-file-ignores] # Vendored from ml-explore/mlx-lm PR #1192 (Blaizzy). Keep as-is so # upstream syncs remain a clean diff. Drop this once mlx-lm 0.32+ # ships native deepseek_v4 support and we delete the vendored copy. "vllm_mlx/models/deepseek_v4.py" = ["UP", "B", "SIM", "N", "F", "I", "E"] # Tensor-shape parameter names (B, H, N, D) follow ML literature # convention; N803 lowercase rule is incompatible. "scripts/bench_attention.py" = ["N803"] [tool.mypy] python_version = "3.10" warn_return_any = true warn_unused_configs = true ignore_missing_imports = true [tool.pytest.ini_options] testpaths = ["tests"] python_files = ["test_*.py"] asyncio_mode = "auto"