Source code for llmsql.inference.inference_vllm

"""
LLMSQL vLLM Inference Function
==============================

This module provides a single function `inference_vllm()` that performs
text-to-SQL generation using large language models via the vLLM backend.

Example
-------

.. code-block:: python

    from llmsql.inference import inference_vllm

    results = inference_vllm(
        model_name="Qwen/Qwen2.5-1.5B-Instruct",
        output_file="outputs/predictions.jsonl",
        questions_path="data/questions.jsonl",
        tables_path="data/tables.jsonl",
        num_fewshots=5,
        batch_size=8,
        max_new_tokens=256,
        temperature=0.7,
        tensor_parallel_size=1,
    )
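
Backend-specific options can be forwarded through `llm_kwargs` and
`sampling_kwargs`, which are passed to `vllm.LLM()` and
`vllm.SamplingParams()` respectively; keys that collide with the explicit
parameters are overridden by the explicit values. A minimal sketch (the
option values shown are illustrative, not recommendations):

.. code-block:: python

    results = inference_vllm(
        model_name="Qwen/Qwen2.5-1.5B-Instruct",
        output_file="outputs/predictions.jsonl",
        llm_kwargs={"gpu_memory_utilization": 0.90, "max_model_len": 4096},
        sampling_kwargs={"top_p": 0.95, "top_k": 50},
    )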

Notes
-----

This function uses the vLLM backend. Outputs may differ from the Transformers
backend due to differences in implementation, batching, and numerical precision.
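
For closer agreement between backends, sampling can be disabled so that both
run greedy decoding (a minimal sketch; small numerical differences may still
remain):

.. code-block:: python

    results = inference_vllm(
        model_name="Qwen/Qwen2.5-1.5B-Instruct",
        output_file="outputs/predictions_greedy.jsonl",
        do_sample=False,  # internally sets temperature to 0.0 in SamplingParams
    )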

"""

from __future__ import annotations

import os

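# Set before vLLM is imported: select the legacy V0 engine, disable V1 engine
# multiprocessing, and request a deterministic cuBLAS workspace so that
# seeded runs stay reproducible.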
os.environ["VLLM_USE_V1"] = "0"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

from pathlib import Path
from typing import Any

from dotenv import load_dotenv
from tqdm import tqdm
from vllm import LLM, SamplingParams

from llmsql.config.config import DEFAULT_WORKDIR_PATH
from llmsql.loggers.logging_config import log
from llmsql.utils.inference_utils import _maybe_download, _setup_seed
from llmsql.utils.utils import (
    choose_prompt_builder,
    load_jsonl,
    overwrite_jsonl,
    save_jsonl_lines,
)

load_dotenv()


def inference_vllm(
    model_name: str,
    *,
    # === Model Loading Parameters ===
    trust_remote_code: bool = True,
    tensor_parallel_size: int = 1,
    hf_token: str | None = None,
    llm_kwargs: dict[str, Any] | None = None,
    use_chat_template: bool = True,
    # === Generation Parameters ===
    max_new_tokens: int = 256,
    temperature: float = 1.0,
    do_sample: bool = True,
    sampling_kwargs: dict[str, Any] | None = None,
    # === Benchmark Parameters ===
    output_file: str = "llm_sql_predictions.jsonl",
    questions_path: str | None = None,
    tables_path: str | None = None,
    workdir_path: str = DEFAULT_WORKDIR_PATH,
    num_fewshots: int = 5,
    batch_size: int = 8,
    seed: int = 42,
) -> list[dict[str, str]]:
    """
    Run SQL generation using vLLM.

    Args:
        model_name: Hugging Face model name or path.

        # Model Loading:
        trust_remote_code: Whether to trust remote code (default: True).
        tensor_parallel_size: Number of GPUs for tensor parallelism (default: 1).
        hf_token: Hugging Face authentication token.
        llm_kwargs: Additional arguments for vllm.LLM(). Note: 'model',
            'tokenizer', 'tensor_parallel_size', 'trust_remote_code' are
            handled separately and will override values here.

        # Generation:
        max_new_tokens: Maximum tokens to generate per sequence.
        temperature: Sampling temperature (0.0 = greedy).
        do_sample: Whether to use sampling vs greedy decoding.
        sampling_kwargs: Additional arguments for vllm.SamplingParams(). Note:
            'temperature', 'max_tokens' are handled separately and will
            override values here.

        # Benchmark:
        output_file: Path to write outputs (will be overwritten).
        questions_path: Path to questions.jsonl (auto-downloads if missing).
        tables_path: Path to tables.jsonl (auto-downloads if missing).
        workdir_path: Directory to store downloaded data.
        num_fewshots: Number of few-shot examples (0, 1, or 5).
        batch_size: Number of questions per generation batch.
        seed: Random seed for reproducibility.

    Returns:
        List of dicts containing `question_id` and generated `completion`.
    """
    # --- setup ---
    llm_kwargs = llm_kwargs or {}
    sampling_kwargs = sampling_kwargs or {}
    _setup_seed(seed=seed)
    hf_token = hf_token or os.environ.get("HF_TOKEN")

    workdir = Path(workdir_path)
    workdir.mkdir(parents=True, exist_ok=True)

    # --- load input data ---
    log.info("Preparing questions and tables...")
    questions_path = _maybe_download("questions.jsonl", questions_path)
    tables_path = _maybe_download("tables.jsonl", tables_path)

    questions = load_jsonl(questions_path)
    tables_list = load_jsonl(tables_path)
    tables = {t["table_id"]: t for t in tables_list}

    # --- init model ---
    llm_init_args = {
        **llm_kwargs,  # user kwargs first; the explicit parameters below take precedence
        "model": model_name,
        "tokenizer": model_name,
        "tensor_parallel_size": tensor_parallel_size,
        "trust_remote_code": trust_remote_code,
    }
    log.info(f"Loading vLLM model '{model_name}' (tp={tensor_parallel_size})...")
    llm = LLM(**llm_init_args)
    tokenizer = llm.get_tokenizer()

    if use_chat_template:
        use_chat_template = getattr(tokenizer, "chat_template", None)  # type: ignore

    # --- prepare output file ---
    overwrite_jsonl(output_file)
    log.info(f"Output will be written to {output_file}")

    # --- prompt builder and sampling params ---
    prompt_builder = choose_prompt_builder(num_fewshots)
    effective_temperature = 0.0 if not do_sample else temperature
    sampling_params_args = {
        **sampling_kwargs,  # user kwargs first; temperature/max_tokens below take precedence
        "temperature": effective_temperature,
        "max_tokens": max_new_tokens,
    }
    sampling_params = SamplingParams(**sampling_params_args)

    # --- main inference loop ---
    all_results: list[dict[str, str]] = []
    total = len(questions)

    for batch_start in tqdm(range(0, total, batch_size), desc="Generating"):
        batch = questions[batch_start : batch_start + batch_size]

        prompts = []
        for q in batch:
            tbl = tables[q["table_id"]]
            example_row = tbl["rows"][0] if tbl["rows"] else []
            raw_text = prompt_builder(
                q["question"], tbl["header"], tbl["types"], example_row
            )

            if use_chat_template:
                messages = [{"role": "user", "content": raw_text}]
                final_prompt = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            else:
                final_prompt = raw_text

            prompts.append(final_prompt)

        outputs = llm.generate(prompts, sampling_params)

        batch_results: list[dict[str, str]] = []
        for q, out in zip(batch, outputs, strict=False):
            text = out.outputs[0].text
            batch_results.append(
                {
                    "question_id": q.get("question_id", q.get("id", "")),
                    "completion": text,
                }
            )

        save_jsonl_lines(output_file, batch_results)
        all_results.extend(batch_results)
        log.info(
            f"Saved batch {batch_start // batch_size + 1}: {len(all_results)}/{total}"
        )

    log.info(f"Generation completed. {len(all_results)} results saved to {output_file}")
    return all_results