
Models

The module that contains all the models integrated in Outlines.

We group models into submodules by provider rather than by theme (completion, chat completion, diffusion models, etc.), and use routing functions elsewhere in the codebase.
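
Each provider exposes a loader function that returns a wrapped model with a common interface. A minimal sketch (the repository id and filename below are illustrative, not prescriptive):

from outlines import models

# Download a GGUF model from the HuggingFace Hub and wrap it in Outlines' common interface.
model = models.llamacpp(
    "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",   # illustrative repo id
    "mistral-7b-instruct-v0.2.Q4_K_M.gguf",     # illustrative filename (or glob pattern)
)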

exllamav2

ExLlamaV2Model

Represents an `exl2` model.

Source code in outlines/models/exllamav2.py
class ExLlamaV2Model:
    """Represents a `exl2` model."""

    def __init__(
        self,
        generator: "ExLlamaV2DynamicGenerator",
        tokenizer: "OutlinesExLlamaV2Tokenizer",
        max_seq_len: int,
    ):
        self.generator = generator
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def prepare_generation_parameters(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        sampling_parameters: SamplingParameters,
        structure_logits_processor,
        **exllamav2_params: Unpack[ExllamaV2Params],
    ) -> Tuple[ExllamaV2Params, Union[str, List[str]]]:
        """Prepare the generation parameters.

        `exllamav2` uses different default values

        """
        from exllamav2.generator import ExLlamaV2Sampler

        if isinstance(prompts, str):
            prompts = [prompts]
        max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

        if max_tokens is None:
            max_tokens = []
            for prompt in prompts:
                ids = self.generator.tokenizer.encode(
                    prompt, encode_special_tokens=True
                )
                prompt_tokens = ids.shape[-1]
                max_tokens.append(self.max_seq_len - prompt_tokens)
            exllamav2_params["max_new_tokens"] = max_tokens
        else:
            exllamav2_params["max_new_tokens"] = [
                max_tokens for _ in range(len(prompts))
            ]

        stop_conditions = [self.generator.tokenizer.eos_token_id]
        if isinstance(generation_parameters.stop_at, str):
            stop_conditions.append(generation_parameters.stop_at)
        elif isinstance(generation_parameters.stop_at, list):
            for stop_at in generation_parameters.stop_at:
                stop_conditions.append(stop_at)
        exllamav2_params["stop_conditions"] = stop_conditions
        exllamav2_params["seed"] = seed

        gen_settings = ExLlamaV2Sampler.Settings()
        if sampling_parameters.temperature is not None:
            gen_settings.temperature = sampling_parameters.temperature
        if sampling_parameters.top_p is not None:
            gen_settings.top_p = sampling_parameters.top_p
        if sampling_parameters.top_k is not None:
            gen_settings.top_k = sampling_parameters.top_k
        gen_settings.logits_processor = structure_logits_processor
        exllamav2_params["gen_settings"] = gen_settings
        if sampling_parameters.num_samples > 1:
            prompts = prompts * sampling_parameters.num_samples
            exllamav2_params["max_new_tokens"] = (
                exllamav2_params["max_new_tokens"] * sampling_parameters.num_samples
            )

        if len(prompts) == 1:
            prompts = prompts[0]

        return exllamav2_params, prompts

    def reformat_output(
        self, output: Union[str, List[str]], sampling_parameters: SamplingParameters
    ):
        """
        The purpose of this function is to reformat the output from exllamav2's output format to outline's output format.

        For exllamav2, it mainly accepts only a list or a string(they also do cfg sampling with tuples but we will ignore this for now).
        The exllamav2's logic is:

        1. If the prompt is a string, return a string. This is the same as outlines
        2. If a prompt is a list, return a list. This is not the same as outlines output in that if the list is only one element, the string is expected to be outputted.
        3. There is no such thing as num_samples, so the prompts had to be duplicated by num_samples times. Then, we had the function output a list of lists
        """
        if isinstance(output, str):
            return output
        if len(output) == 1:
            return output[0]
        if sampling_parameters.num_samples > 1:
            if len(output) == sampling_parameters.num_samples:
                return output
            assert len(output) % sampling_parameters.num_samples == 0
            num_items_per_sample = len(output) // sampling_parameters.num_samples
            new_output = []
            for i in range(sampling_parameters.num_samples):
                curr_sample = []
                for j in range(num_items_per_sample):
                    curr_sample.append(output[i * num_items_per_sample + j])
                new_output.append(curr_sample)
            return new_output
        return output

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        structure_logits_processor,
        sampling_parameters: SamplingParameters,
        **exllamav2_params: Unpack[ExllamaV2Params],
    ) -> Union[str, List[str]]:
        exllamav2_params, prompts = self.prepare_generation_parameters(
            prompts,
            generation_parameters,
            sampling_parameters,
            structure_logits_processor,
        )
        """
        In exllamav2, it needs the max amount of new tokens generated.
        The reason exllamav2_params["max_new_tokens"] is a list is because in prepare_generation_parameters
        the max amount of tokens that can be generated by the model for each prompt(by encoding with tokenizer) is calculated.
        The minimum is picked because otherwise it might be possible for one of the
        prompts to exceed the max sequence length.
        """
        output = self.generator.generate(
            prompt=prompts,
            gen_settings=exllamav2_params["gen_settings"],
            max_new_tokens=min(exllamav2_params["max_new_tokens"]),
            completion_only=True,
            encode_special_tokens=True,
            stop_conditions=exllamav2_params["stop_conditions"],
            add_bos=False,
            seed=exllamav2_params["seed"],
        )

        return self.reformat_output(output, sampling_parameters)

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        structure_logits_processor,
        sampling_parameters: SamplingParameters,
        **exllamav2_params: Unpack[ExllamaV2Params],
    ) -> Iterator[Union[str, List[str]]]:
        from exllamav2.generator import ExLlamaV2DynamicJob

        exllamav2_params, prompts = self.prepare_generation_parameters(
            prompts,
            generation_parameters,
            sampling_parameters,
            structure_logits_processor,
        )

        order = {}
        if isinstance(prompts, str):
            prompts = [prompts]
        batch_size = len(prompts)
        seed = exllamav2_params["seed"]
        for idx, p in enumerate(prompts):
            input_ids = self.generator.tokenizer.encode(
                p, encode_special_tokens=True, add_bos=False
            )

            job = ExLlamaV2DynamicJob(
                input_ids=input_ids,
                max_new_tokens=exllamav2_params["max_new_tokens"][idx],
                min_new_tokens=0,
                seed=seed,
                stop_conditions=exllamav2_params["stop_conditions"],
                gen_settings=exllamav2_params["gen_settings"],
                token_healing=False,
                decode_special_tokens=False,
            )

            if seed is not None:
                seed += 1

            serial = self.generator.enqueue(job)
            order[serial] = idx

        # Collect outputs until all jobs finish

        next_text = [""] * batch_size

        def token_generator() -> Iterator[str]:
            while self.generator.num_remaining_jobs():
                results = self.generator.iterate()
                for r in results:
                    idx = order[r["serial"]]
                    if r["stage"] == "streaming":
                        text = r.get("text", "")
                        next_text[idx] = text
                    if r["eos"]:
                        next_text[idx] = ""
                yield self.reformat_output(next_text, sampling_parameters)
            return

        return token_generator()

prepare_generation_parameters(prompts, generation_parameters, sampling_parameters, structure_logits_processor, **exllamav2_params)

Prepare the generation parameters.

`exllamav2` uses different default values.

Source code in outlines/models/exllamav2.py
def prepare_generation_parameters(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    sampling_parameters: SamplingParameters,
    structure_logits_processor,
    **exllamav2_params: Unpack[ExllamaV2Params],
) -> Tuple[ExllamaV2Params, Union[str, List[str]]]:
    """Prepare the generation parameters.

    `exllamav2` uses different default values

    """
    from exllamav2.generator import ExLlamaV2Sampler

    if isinstance(prompts, str):
        prompts = [prompts]
    max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

    if max_tokens is None:
        max_tokens = []
        for prompt in prompts:
            ids = self.generator.tokenizer.encode(
                prompt, encode_special_tokens=True
            )
            prompt_tokens = ids.shape[-1]
            max_tokens.append(self.max_seq_len - prompt_tokens)
        exllamav2_params["max_new_tokens"] = max_tokens
    else:
        exllamav2_params["max_new_tokens"] = [
            max_tokens for _ in range(len(prompts))
        ]

    stop_conditions = [self.generator.tokenizer.eos_token_id]
    if isinstance(generation_parameters.stop_at, str):
        stop_conditions.append(generation_parameters.stop_at)
    elif isinstance(generation_parameters.stop_at, list):
        for stop_at in generation_parameters.stop_at:
            stop_conditions.append(stop_at)
    exllamav2_params["stop_conditions"] = stop_conditions
    exllamav2_params["seed"] = seed

    gen_settings = ExLlamaV2Sampler.Settings()
    if sampling_parameters.temperature is not None:
        gen_settings.temperature = sampling_parameters.temperature
    if sampling_parameters.top_p is not None:
        gen_settings.top_p = sampling_parameters.top_p
    if sampling_parameters.top_k is not None:
        gen_settings.top_k = sampling_parameters.top_k
    gen_settings.logits_processor = structure_logits_processor
    exllamav2_params["gen_settings"] = gen_settings
    if sampling_parameters.num_samples > 1:
        prompts = prompts * sampling_parameters.num_samples
        exllamav2_params["max_new_tokens"] = (
            exllamav2_params["max_new_tokens"] * sampling_parameters.num_samples
        )

    if len(prompts) == 1:
        prompts = prompts[0]

    return exllamav2_params, prompts
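
The method unpacks the two Outlines parameter dataclasses with dataclasses.astuple. A minimal sketch of a call, assuming an ExLlamaV2Model instance named model; the import path and the concrete values are assumptions, and the field order is inferred from the unpacking above:

from outlines.generate.api import GenerationParameters, SamplingParameters  # assumed import path

gen_params = GenerationParameters(256, ["\n\n"], 42)                     # (max_tokens, stop_at, seed)
sampling_params = SamplingParameters("multinomial", 1, 0.9, None, 0.7)   # (sampler, num_samples, top_p, top_k, temperature)

exllamav2_params, prompts = model.prepare_generation_parameters(
    "Write a haiku about autumn.",
    gen_params,
    sampling_params,
    None,                                                                 # structure_logits_processor
)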

reformat_output(output, sampling_parameters)

The purpose of this function is to reformat the output from exllamav2's output format to Outlines' output format.

exllamav2 mainly accepts only a list or a string (it also does CFG sampling with tuples, but we ignore that for now). The exllamav2 logic is:

  1. If the prompt is a string, return a string. This is the same as Outlines.
  2. If the prompt is a list, return a list. This differs from Outlines, which returns a string when the list contains a single element.
  3. There is no notion of `num_samples`, so the prompts have to be duplicated `num_samples` times; the function then returns a list of lists (see the sketch after the source listing below).
Source code in outlines/models/exllamav2.py
def reformat_output(
    self, output: Union[str, List[str]], sampling_parameters: SamplingParameters
):
    """
    The purpose of this function is to reformat the output from exllamav2's output format to outline's output format.

    For exllamav2, it mainly accepts only a list or a string(they also do cfg sampling with tuples but we will ignore this for now).
    The exllamav2's logic is:

    1. If the prompt is a string, return a string. This is the same as outlines
    2. If a prompt is a list, return a list. This is not the same as outlines output in that if the list is only one element, the string is expected to be outputted.
    3. There is no such thing as num_samples, so the prompts had to be duplicated by num_samples times. Then, we had the function output a list of lists
    """
    if isinstance(output, str):
        return output
    if len(output) == 1:
        return output[0]
    if sampling_parameters.num_samples > 1:
        if len(output) == sampling_parameters.num_samples:
            return output
        assert len(output) % sampling_parameters.num_samples == 0
        num_items_per_sample = len(output) // sampling_parameters.num_samples
        new_output = []
        for i in range(sampling_parameters.num_samples):
            curr_sample = []
            for j in range(num_items_per_sample):
                curr_sample.append(output[i * num_items_per_sample + j])
            new_output.append(curr_sample)
        return new_output
    return output
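
A sketch of the regrouping described above, using illustrative values: with two prompts and num_samples=2, the prompts were duplicated to [p1, p2, p1, p2], so the flat output is regrouped into one inner list per sample.

# Illustrative walk-through of the regrouping logic; not part of the library.
output = ["p1_sample1", "p2_sample1", "p1_sample2", "p2_sample2"]
num_samples = 2
num_items_per_sample = len(output) // num_samples  # 2

regrouped = [
    output[i * num_items_per_sample : (i + 1) * num_items_per_sample]
    for i in range(num_samples)
]
# regrouped == [["p1_sample1", "p2_sample1"], ["p1_sample2", "p2_sample2"]]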

exl2(model_path, draft_model_path=None, max_seq_len=None, cache_q4=False, paged=True, max_chunk_size=None)

Load an ExLlamaV2 model.

Parameters

Name | Type | Description | Default
model_path | str | Path to the model directory. | required
device | | Device to load the model on. Pass in 'cuda' for GPU or 'cpu' for CPU. | required
max_seq_len | Optional[int] | Maximum sequence length. Defaults to None. | None
scale_pos_emb | | Scale factor for positional embeddings. Defaults to None. | required
scale_alpha_value | | Scale alpha value. Defaults to None. | required
no_flash_attn | | Disable flash attention. Defaults to None. | required
num_experts_per_token | | Number of experts per token. Defaults to None. | required
cache_q4 | bool | Use Q4 cache. Defaults to False. | False
tokenizer_kwargs | | Additional keyword arguments for the tokenizer. Defaults to {}. | required
gpu_split | | "auto", or VRAM allocation per GPU in GB. "auto" uses exllama's autosplit feature. | required
low_mem | | Enable VRAM optimizations, potentially trading off speed. | required
verbose | | Enable if you want debugging statements. | required

Returns

An `ExLlamaV2Model` instance.

Raises

`ImportError` if the `exllamav2` library is not installed.

Source code in outlines/models/exllamav2.py
def exl2(
    model_path: str,
    draft_model_path: Optional[str] = None,
    max_seq_len: Optional[int] = None,
    cache_q4: bool = False,
    paged: bool = True,
    max_chunk_size: Optional[int] = None,
) -> ExLlamaV2Model:
    """
    Load an ExLlamaV2 model.

    Parameters
    ----------
    model_path (str)
        Path to the model directory.
    device (str)
        Device to load the model on. Pass in 'cuda' for GPU or 'cpu' for CPU
    max_seq_len (Optional[int], optional)
        Maximum sequence length. Defaults to None.
    scale_pos_emb (Optional[float], optional)
        Scale factor for positional embeddings. Defaults to None.
    scale_alpha_value (Optional[float], optional)
        Scale alpha value. Defaults to None.
    no_flash_attn (Optional[bool], optional)
        Disable flash attention. Defaults to None.
    num_experts_per_token (Optional[int], optional)
        Number of experts per token. Defaults to None.
    cache_q4 (bool, optional)
        Use Q4 cache. Defaults to False.
    tokenizer_kwargs (dict, optional)
        Additional keyword arguments for the tokenizer. Defaults to {}.
    gpu_split (str)
        \"auto\", or VRAM allocation per GPU in GB. Auto will use exllama's autosplit feature
    low_mem (bool, optional)
        Enable VRAM optimizations, potentially trading off speed
    verbose (bool, optional)
        Enable if you want debugging statements

    Returns
    -------
    An `ExLlamaV2Model` instance.

    Raises
    ------
    `ImportError` if the `exllamav2` library is not installed.

    """
    try:
        from exllamav2 import (
            ExLlamaV2,
            ExLlamaV2Cache,
            ExLlamaV2Cache_Q4,
            ExLlamaV2Config,
            ExLlamaV2Tokenizer,
        )
        from exllamav2.generator import ExLlamaV2DynamicGenerator

    except ImportError:
        raise ImportError(
            "The `exllamav2`, `transformers` and `torch` libraries needs to be installed in order to use `exllamav2` models. "
            "Please run `pip install transformers torch git+https://github.com/lapp0/exllamav2@sampler-logits-processor` "
            "Documentation: https://outlines.org.cn/outlines/latest/reference/models/exllamav2/"
        )
    config = ExLlamaV2Config(model_path)
    if max_chunk_size is not None:
        config.max_input_len = max_chunk_size
        config.max_attention_size = max_chunk_size**2

    config.arch_compat_overrides()
    model = ExLlamaV2(config)
    if max_seq_len is None:
        max_seq_len = -1
    if cache_q4:
        cache = ExLlamaV2Cache_Q4(model, max_seq_len=max_seq_len, lazy=True)
    else:
        cache = ExLlamaV2Cache(model, max_seq_len=max_seq_len, lazy=True)
    model.load_autosplit(cache, progress=True)

    print("Loading tokenizer...")
    tokenizer = ExLlamaV2Tokenizer(config)
    max_batch_size = 4 if paged else 1

    draft_model = None
    draft_cache = None
    if draft_model_path is not None:
        draft_config = ExLlamaV2Config(draft_model_path)
        draft_model = ExLlamaV2(draft_config)

        if cache_q4:
            draft_cache = ExLlamaV2Cache_Q4(
                draft_model, max_seq_len=max_seq_len, lazy=True
            )
        else:
            draft_cache = ExLlamaV2Cache(
                draft_model, max_seq_len=max_seq_len, lazy=True
            )

    # Initialize the generator with all default parameters
    generator = ExLlamaV2DynamicGenerator(
        model=model,
        cache=cache,
        draft_model=draft_model,
        draft_cache=draft_cache,
        tokenizer=tokenizer,
        max_batch_size=max_batch_size,
        use_ngram_draft=False,
        max_chunk_size=max_chunk_size,
        paged=paged,
    )
    max_seq_len = cache.max_seq_len

    outlines_tokenizer = OutlinesExLlamaV2Tokenizer(tokenizer)
    outlines_exl2_model = ExLlamaV2Model(generator, outlines_tokenizer, max_seq_len)
    return outlines_exl2_model
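
A minimal usage sketch, assuming exl2 is exposed as outlines.models.exl2 and that an EXL2-quantized model has already been downloaded locally (the path below is illustrative):

from outlines import models

model = models.exl2(
    model_path="./Mistral-7B-Instruct-exl2",  # illustrative local path to an EXL2 model directory
    max_seq_len=4096,
    cache_q4=True,                            # use the Q4 KV cache
)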

llamacpp

LlamaCpp

Represents a model provided by the `llama-cpp-python` library.

We wrap models from model-providing libraries in order to give them all the same interface in Outlines and allow users to easily switch between providers. This class wraps the `llama_cpp.Llama` class from the `llama-cpp-python` library.

Source code in outlines/models/llamacpp.py
class LlamaCpp:
    """Represents a model provided by the `llama-cpp-python` library.

    We wrap models from model providing libraries in order to give all of
    them the same interface in Outlines and allow users to easily switch
    between providers. This class wraps the `llama_cpp.Llama` class from the
    `llama-cpp-python` library.

    """

    def __init__(self, model: "Llama"):
        self.model = model

    @property
    def tokenizer(self):
        return LlamaCppTokenizer(self.model)

    def prepare_generation_parameters(
        self,
        generation_parameters: GenerationParameters,
        sampling_parameters: SamplingParameters,
        structure_logits_processor,
        **llama_cpp_params: Unpack[LlamaCppParams],
    ):
        """Prepare the generation parameters.

        `llama-cpp-python` uses different default values

        """
        from llama_cpp import LogitsProcessorList

        max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

        # We update `llama_cpp_params` with the values the user passed to the
        # generator.
        if "stop" not in llama_cpp_params:
            llama_cpp_params["stop"] = stop_at
        if "seed" not in llama_cpp_params:
            llama_cpp_params["seed"] = seed

        # Somehow `llama-cpp-python` generates `max_tokens + 1`  tokens
        if "max_tokens" not in llama_cpp_params:
            if max_tokens is None:
                llama_cpp_params["max_tokens"] = -1  # indicates unlimited tokens
            else:
                llama_cpp_params["max_tokens"] = max_tokens - 1
        else:
            llama_cpp_params["max_tokens"] = llama_cpp_params["max_tokens"] - 1

        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )

        # We update the `llama_cpp_params` with the sampling values that
        # were specified by the user via the `Sampler` class, unless they
        # are also specified in `llama_cpp_params`. We also disable other
        # sampling methods that are enabled by default and reset the temperature
        # value.
        #
        # See https://github.com/ggerganov/llama.cpp/blob/e11a8999b5690f810c2c99c14347f0834e68c524/common/sampling.h#L22
        # for the default values in `llama.cpp` and indications to disable the sampling modes.
        # Mirostat sampling, tail-free sampling and all penalties are disabled by default.
        #
        # See https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__
        # for default values in `llama-cpp-python`
        if sampler == "beam_search":
            raise NotImplementedError(
                "The `llama_cpp_python` library does not support Beam Search."
            )
        if num_samples != 1:
            raise NotImplementedError(
                "The `llama_cpp_python` library does not allow to take several samples."
            )
        if "top_p" not in llama_cpp_params:
            if top_p is not None:
                llama_cpp_params["top_p"] = top_p
            else:
                llama_cpp_params["top_p"] = 1.0

        if "min_p" not in llama_cpp_params:
            llama_cpp_params["min_p"] = 0.0

        if "top_k" not in llama_cpp_params:
            if top_k is not None:
                llama_cpp_params["top_k"] = top_k
            else:
                llama_cpp_params["top_k"] = -1

        if "temperature" not in llama_cpp_params:
            if temperature is not None:
                llama_cpp_params["temperature"] = temperature
            else:
                llama_cpp_params["temperature"] = 1.0

        if "repeat_penalty" not in llama_cpp_params:
            llama_cpp_params["repeat_penalty"] = 1.0

        # The choice to stream or not should happen via the high-level API
        llama_cpp_params["stream"] = False

        if structure_logits_processor is not None:
            if "logits_processor" in llama_cpp_params:
                llama_cpp_params["logits_processor"].append(structure_logits_processor)
            else:
                llama_cpp_params["logits_processor"] = LogitsProcessorList(
                    [structure_logits_processor]
                )

        return llama_cpp_params

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        structure_logits_processor,
        sampling_parameters: SamplingParameters,
        **llama_cpp_params: Unpack[LlamaCppParams],
    ) -> str:
        """Generate text using `llama-cpp-python`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__cal__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.
        llama_cpp_params
            Keyword arguments that can be passed to
            `llama_cpp_python.Llama.__call__`.  The values in `llama_cpp_params`
            supersede the values of the parameters in `generation_parameters` and
            `sampling_parameters`.  See the `llama_cpp_python` documentation for
            a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

        Returns
        -------
        The generated text.

        """
        if not isinstance(prompts, str):
            raise NotImplementedError(
                "The `llama-cpp-python` library does not support batch inference."
            )

        llama_cpp_params = self.prepare_generation_parameters(
            generation_parameters,
            sampling_parameters,
            structure_logits_processor,
            **llama_cpp_params,
        )
        completion = self.model(prompts, **llama_cpp_params)
        result = completion["choices"][0]["text"]

        self.model.reset()

        return result

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        structure_logits_processor,
        sampling_parameters: SamplingParameters,
        **llama_cpp_params: Unpack[LlamaCppParams],
    ) -> Iterator[str]:
        """Stream text using `llama-cpp-python`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__cal__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.
        llama_cpp_params
            Keyword arguments that can be passed to
            `llama_cpp_python.Llama.__call__`.  The values in `llama_cpp_params`
            supersede the values of the parameters in `generation_parameters` and
            `sampling_parameters`.  See the `llama_cpp_python` documentation for
            a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

        Returns
        -------
        A generator that return strings.

        """

        if not isinstance(prompts, str):
            raise NotImplementedError(
                "The `llama-cpp-python` library does not support batch inference."
            )

        llama_cpp_params = self.prepare_generation_parameters(
            generation_parameters,
            sampling_parameters,
            structure_logits_processor,
            **llama_cpp_params,
        )
        llama_cpp_params["stream"] = True
        generator = self.model(prompts, **llama_cpp_params)

        def token_generator() -> Iterator[str]:
            while True:
                try:
                    result = next(generator)
                    yield result["choices"][0]["text"]
                except StopIteration:
                    self.model.reset()
                    return

        return token_generator()

    def load_lora(self, adapter_path: str):
        if self.model._model.apply_lora_from_file(
            adapter_path,
            1.0,
        ):
            raise RuntimeError(f"Failed to apply LoRA from lora path: {adapter_path}")

generate(prompts, generation_parameters, structure_logits_processor, sampling_parameters, **llama_cpp_params)

Generate text using `llama-cpp-python`.

Parameters

Name | Type | Description | Default
prompts | Union[str, List[str]] | A prompt or list of prompts. | required
generation_parameters | GenerationParameters | An instance of `GenerationParameters` that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to `SequenceGeneratorAdapter`'s `__call__` method. | required
logits_processor | | The logits processor to use when generating text. | required
sampling_parameters | SamplingParameters | An instance of `SamplingParameters`, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines. | required
llama_cpp_params | Unpack[LlamaCppParams] | Keyword arguments that can be passed to `llama_cpp_python.Llama.__call__`. The values in `llama_cpp_params` supersede the values of the parameters in `generation_parameters` and `sampling_parameters`. See the `llama_cpp_python` documentation for a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__ | {}

Returns

The generated text.

Source code in outlines/models/llamacpp.py
def generate(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    structure_logits_processor,
    sampling_parameters: SamplingParameters,
    **llama_cpp_params: Unpack[LlamaCppParams],
) -> str:
    """Generate text using `llama-cpp-python`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__cal__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.
    llama_cpp_params
        Keyword arguments that can be passed to
        `llama_cpp_python.Llama.__call__`.  The values in `llama_cpp_params`
        supersede the values of the parameters in `generation_parameters` and
        `sampling_parameters`.  See the `llama_cpp_python` documentation for
        a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

    Returns
    -------
    The generated text.

    """
    if not isinstance(prompts, str):
        raise NotImplementedError(
            "The `llama-cpp-python` library does not support batch inference."
        )

    llama_cpp_params = self.prepare_generation_parameters(
        generation_parameters,
        sampling_parameters,
        structure_logits_processor,
        **llama_cpp_params,
    )
    completion = self.model(prompts, **llama_cpp_params)
    result = completion["choices"][0]["text"]

    self.model.reset()

    return result

prepare_generation_parameters(generation_parameters, sampling_parameters, structure_logits_processor, **llama_cpp_params)

Prepare the generation parameters.

`llama-cpp-python` uses different default values.

Source code in outlines/models/llamacpp.py
def prepare_generation_parameters(
    self,
    generation_parameters: GenerationParameters,
    sampling_parameters: SamplingParameters,
    structure_logits_processor,
    **llama_cpp_params: Unpack[LlamaCppParams],
):
    """Prepare the generation parameters.

    `llama-cpp-python` uses different default values

    """
    from llama_cpp import LogitsProcessorList

    max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

    # We update `llama_cpp_params` with the values the user passed to the
    # generator.
    if "stop" not in llama_cpp_params:
        llama_cpp_params["stop"] = stop_at
    if "seed" not in llama_cpp_params:
        llama_cpp_params["seed"] = seed

    # Somehow `llama-cpp-python` generates `max_tokens + 1`  tokens
    if "max_tokens" not in llama_cpp_params:
        if max_tokens is None:
            llama_cpp_params["max_tokens"] = -1  # indicates unlimited tokens
        else:
            llama_cpp_params["max_tokens"] = max_tokens - 1
    else:
        llama_cpp_params["max_tokens"] = llama_cpp_params["max_tokens"] - 1

    sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
        sampling_parameters
    )

    # We update the `llama_cpp_params` with the sampling values that
    # were specified by the user via the `Sampler` class, unless they
    # are also specified in `llama_cpp_params`. We also disable other
    # sampling methods that are enabled by default and reset the temperature
    # value.
    #
    # See https://github.com/ggerganov/llama.cpp/blob/e11a8999b5690f810c2c99c14347f0834e68c524/common/sampling.h#L22
    # for the default values in `llama.cpp` and indications to disable the sampling modes.
    # Mirostat sampling, tail-free sampling and all penalties are disabled by default.
    #
    # See https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__
    # for default values in `llama-cpp-python`
    if sampler == "beam_search":
        raise NotImplementedError(
            "The `llama_cpp_python` library does not support Beam Search."
        )
    if num_samples != 1:
        raise NotImplementedError(
            "The `llama_cpp_python` library does not allow to take several samples."
        )
    if "top_p" not in llama_cpp_params:
        if top_p is not None:
            llama_cpp_params["top_p"] = top_p
        else:
            llama_cpp_params["top_p"] = 1.0

    if "min_p" not in llama_cpp_params:
        llama_cpp_params["min_p"] = 0.0

    if "top_k" not in llama_cpp_params:
        if top_k is not None:
            llama_cpp_params["top_k"] = top_k
        else:
            llama_cpp_params["top_k"] = -1

    if "temperature" not in llama_cpp_params:
        if temperature is not None:
            llama_cpp_params["temperature"] = temperature
        else:
            llama_cpp_params["temperature"] = 1.0

    if "repeat_penalty" not in llama_cpp_params:
        llama_cpp_params["repeat_penalty"] = 1.0

    # The choice to stream or not should happen via the high-level API
    llama_cpp_params["stream"] = False

    if structure_logits_processor is not None:
        if "logits_processor" in llama_cpp_params:
            llama_cpp_params["logits_processor"].append(structure_logits_processor)
        else:
            llama_cpp_params["logits_processor"] = LogitsProcessorList(
                [structure_logits_processor]
            )

    return llama_cpp_params
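
Because each keyword is only filled in when it is absent, values passed directly in llama_cpp_params take precedence over those derived from the Outlines parameter objects. A hypothetical illustration, assuming a LlamaCpp instance named model and the dataclass import path shown (both are assumptions):

from outlines.generate.api import GenerationParameters, SamplingParameters  # assumed import path

gen_params = GenerationParameters(256, None, 0)                         # (max_tokens, stop_at, seed)
sampling_params = SamplingParameters("multinomial", 1, 0.9, None, 0.7)  # (sampler, num_samples, top_p, top_k, temperature)

llama_cpp_params = model.prepare_generation_parameters(
    gen_params,
    sampling_params,
    None,                          # structure_logits_processor
    temperature=0.2,               # passed explicitly, so it wins over sampling_params' 0.7
)
assert llama_cpp_params["temperature"] == 0.2
assert llama_cpp_params["max_tokens"] == 255   # llama-cpp-python generates max_tokens + 1 tokens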

stream(prompts, generation_parameters, structure_logits_processor, sampling_parameters, **llama_cpp_params)

Stream text using `llama-cpp-python`.

Parameters

Name | Type | Description | Default
prompts | Union[str, List[str]] | A prompt or list of prompts. | required
generation_parameters | GenerationParameters | An instance of `GenerationParameters` that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to `SequenceGeneratorAdapter`'s `__call__` method. | required
logits_processor | | The logits processor to use when generating text. | required
sampling_parameters | SamplingParameters | An instance of `SamplingParameters`, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines. | required
llama_cpp_params | Unpack[LlamaCppParams] | Keyword arguments that can be passed to `llama_cpp_python.Llama.__call__`. The values in `llama_cpp_params` supersede the values of the parameters in `generation_parameters` and `sampling_parameters`. See the `llama_cpp_python` documentation for a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__ | {}

Returns

A generator that yields strings.

Source code in outlines/models/llamacpp.py
def stream(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    structure_logits_processor,
    sampling_parameters: SamplingParameters,
    **llama_cpp_params: Unpack[LlamaCppParams],
) -> Iterator[str]:
    """Stream text using `llama-cpp-python`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__cal__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.
    llama_cpp_params
        Keyword arguments that can be passed to
        `llama_cpp_python.Llama.__call__`.  The values in `llama_cpp_params`
        supersede the values of the parameters in `generation_parameters` and
        `sampling_parameters`.  See the `llama_cpp_python` documentation for
        a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__

    Returns
    -------
    A generator that return strings.

    """

    if not isinstance(prompts, str):
        raise NotImplementedError(
            "The `llama-cpp-python` library does not support batch inference."
        )

    llama_cpp_params = self.prepare_generation_parameters(
        generation_parameters,
        sampling_parameters,
        structure_logits_processor,
        **llama_cpp_params,
    )
    llama_cpp_params["stream"] = True
    generator = self.model(prompts, **llama_cpp_params)

    def token_generator() -> Iterator[str]:
        while True:
            try:
                result = next(generator)
                yield result["choices"][0]["text"]
            except StopIteration:
                self.model.reset()
                return

    return token_generator()

LlamaCppTokenizer

Bases: Tokenizer

Source code in outlines/models/llamacpp.py
class LlamaCppTokenizer(Tokenizer):
    def __init__(self, model: "Llama"):
        self.eos_token_id = model.token_eos()
        self.eos_token = model.tokenizer().decode([self.eos_token_id])
        self.pad_token_id = self.eos_token_id
        self.special_tokens: Set[str] = set()

        self.vocabulary: Dict[str, int] = dict()

        self.tokenizer = model.tokenizer()

        # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
        self._hf_tokenizer = None
        try:
            self.vocabulary = model.tokenizer_.hf_tokenizer.get_vocab()
            self._hf_tokenizer = model.tokenizer_.hf_tokenizer
        except AttributeError:
            # ###
            for t in range(model.n_vocab()):
                token_piece = model.tokenizer().decode([t])
                self.vocabulary[token_piece] = t

        # ensure stable ordering of vocabulary
        self.vocabulary = {
            tok: tok_id
            for tok, tok_id in sorted(self.vocabulary.items(), key=lambda x: x[1])
        }

        self._hash = None

    def decode(self, token_ids: List[int]) -> List[str]:
        decoded_bytes = self.tokenizer.detokenize(token_ids)
        return [decoded_bytes.decode("utf-8", errors="ignore")]

    def encode(
        self, prompt: Union[str, List[str]], add_bos: bool = True, special: bool = True
    ) -> Tuple[List[int], List[int]]:
        if isinstance(prompt, list):
            raise NotImplementedError(
                "llama-cpp-python tokenizer doesn't support batch tokenization"
            )
        token_ids = self.tokenizer.tokenize(
            prompt.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
        )
        # generate attention mask, missing from llama-cpp-python
        attention_mask = [
            1 if token_id != self.pad_token_id else 0 for token_id in token_ids
        ]
        return token_ids, attention_mask

    def convert_token_to_string(self, token: str) -> str:
        if self._hf_tokenizer is not None:
            from transformers.file_utils import SPIECE_UNDERLINE

            token_str = self._hf_tokenizer.convert_tokens_to_string([token])
            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
                token_str = " " + token_str
            return token_str
        else:
            return token

    def __eq__(self, other):
        if not isinstance(other, LlamaCppTokenizer):
            return False
        return self.__getstate__() == other.__getstate__()

    def __hash__(self):
        if self._hash is None:
            self._hash = hash(pickle.dumps(self))
        return self._hash

    def __getstate__(self):
        """Create a stable representation for outlines.caching"""
        return (
            self.vocabulary,
            self.eos_token_id,
            self.eos_token,
            self.pad_token_id,
            sorted(self.special_tokens),
        )

    def __setstate__(self, state):
        raise NotImplementedError("Cannot load a pickled llamacpp tokenizer")
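
A hypothetical usage sketch, wrapping an already-loaded llama_cpp.Llama model (the GGUF path is illustrative):

from llama_cpp import Llama

llm = Llama("./model.gguf")                     # illustrative local GGUF path
tokenizer = LlamaCppTokenizer(llm)

token_ids, attention_mask = tokenizer.encode("Hello, world!")
text = tokenizer.decode(token_ids)[0]           # decode() returns a single-element list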

__getstate__()

Create a stable representation for outlines.caching.

Source code in outlines/models/llamacpp.py
def __getstate__(self):
    """Create a stable representation for outlines.caching"""
    return (
        self.vocabulary,
        self.eos_token_id,
        self.eos_token,
        self.pad_token_id,
        sorted(self.special_tokens),
    )

llamacpp(repo_id, filename=None, **llamacpp_model_params)

llama-cpp-python 库加载模型。

我们使用 Llama.from_pretrained 类方法,可以直接从 HuggingFace hub 下载模型,而不是要求用户指定已下载模型的路径。仍然可以通过直接初始化 llama_cpp.Llama 来加载本地模型。

参数

名称 类型 描述 默认值
repo_id str

模型仓库的名称。

必需
filename Optional[str]

用于匹配仓库中模型文件的文件名或 glob 模式。

None
llama_cpp_model_params

Llama 特定的模型参数。有关完整列表,请参阅 llama-cpp-python 文档:https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.init

必需
源代码位于 outlines/models/llamacpp.py
def llamacpp(
    repo_id: str, filename: Optional[str] = None, **llamacpp_model_params
) -> LlamaCpp:
    """Load a model from the `llama-cpp-python` library.

    We use the `Llama.from_pretrained` classmethod that downloads models
    directly from the HuggingFace hub, instead of asking users to specify
    a path to the downloaded model. One can still load a local model
    by initializing `llama_cpp.Llama` directly.

    Parameters
    ----------
    repo_id
        The name of the model repository.
    filename:
        A filename of glob pattern to match the model file in the repo.
    llama_cpp_model_params
        Llama-specific model parameters. See the `llama-cpp-python` documentation
        for the full list: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__

    """
    from llama_cpp import Llama

    # Default to using the model's full context length
    if "n_ctx" not in llamacpp_model_params:
        llamacpp_model_params["n_ctx"] = 0

    if "verbose" not in llamacpp_model_params:
        llamacpp_model_params["verbose"] = False

    # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
    if "tokenizer" not in llamacpp_model_params:
        warnings.warn(
            "The pre-tokenizer in `llama.cpp` handles unicode improperly "
            + "(https://github.com/ggerganov/llama.cpp/pull/5613)\n"
            + "Outlines may raise a `RuntimeError` when building the regex index.\n"
            + "To circumvent this error when using `models.llamacpp()` you may pass the argument"
            + "`tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(<hf_repo_id>)`\n"
        )

    model = Llama.from_pretrained(repo_id, filename, **llamacpp_model_params)

    return LlamaCpp(model)
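
A minimal loading sketch; the repository id, filename and tokenizer repo are illustrative. Passing a Hugging Face tokenizer follows the workaround suggested by the warning above:

import llama_cpp
from outlines import models

model = models.llamacpp(
    "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",        # illustrative repo id
    "mistral-7b-instruct-v0.2.Q4_K_M.gguf",          # illustrative filename or glob pattern
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.2"         # illustrative HF repo id for the tokenizer
    ),
    n_ctx=4096,                                      # override the full-context default set above
)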

mlxlm

MLXLM

Represents an `mlx_lm` model.

Source code in outlines/models/mlxlm.py
class MLXLM:
    """
    Represents an `mlx_lm` model
    """

    def __init__(
        self,
        model: "nn.Module",
        tokenizer: "PreTrainedTokenizer",
    ):
        self.model = model
        self.mlx_tokenizer = tokenizer  # returns mlx tensors, used for encode()
        self.tokenizer = TransformerTokenizer(
            tokenizer._tokenizer
        )  # _tokenizer is HF Tokenizer

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: "GenerationParameters",
        logits_processor,
        sampling_parameters: "SamplingParameters",
    ) -> str:
        streamer = self.stream(
            prompts, generation_parameters, logits_processor, sampling_parameters
        )
        return "".join(list(streamer))

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: "GenerationParameters",
        logits_processor,
        sampling_parameters: "SamplingParameters",
    ) -> Iterator[str]:
        """Generate text using `mlx_lm`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__cal__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.

        Returns
        -------
        The generated text.
        """
        import mlx.core as mx

        max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )
        if max_tokens is None:
            max_tokens = int(1e9)

        if not isinstance(prompts, str):
            raise NotImplementedError(
                "The `mlx-lm` library does not support batch inference."
            )
        if sampler == "beam_search":
            raise NotImplementedError(
                "The `mlx-lm` library does not support Beam Search."
            )
        if num_samples != 1:
            raise NotImplementedError(
                "The `mlx-lm` library does not allow to take several samples."
            )
        if top_k is not None:
            raise NotImplementedError("The `mlx-lm` library does not support top_k.")
        if seed is not None:
            raise NotImplementedError("The `mlx-lm` library does not support seed.")
        if stop_at is not None:
            raise NotImplementedError("The `mlx-lm` library does not support stop_at.")

        generate_kwargs = {
            "temp": temperature,
            "top_p": top_p,
            "sampler": sampler,
            "logits_processor": logits_processor,
        }

        # Adapted from
        # https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L267
        prompt_tokens = mx.array(self.mlx_tokenizer.encode(prompts))

        detokenizer = self.mlx_tokenizer.detokenizer
        detokenizer.reset()

        for (token, prob), n in zip(
            self.generate_step(prompt_tokens, **generate_kwargs),
            range(max_tokens),
        ):
            if token == self.tokenizer.eos_token_id:
                break
            detokenizer.add_token(token)
            yield detokenizer.last_segment

        detokenizer.finalize()
        yield detokenizer.last_segment

    def generate_step(
        self,
        prompt: "mx.array",
        temp: Optional[float],
        top_p: Optional[float],
        sampler: str,
        logits_processor: "OutlinesLogitsProcessor",
    ) -> Generator[Tuple[int, float], None, None]:
        """
        Adapted from
        https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L129

        A generator producing token ids based on the given prompt from the model.

        Parameters
        ----------
        prompt
            The input prompt.
        temp
            The temperature for sampling, if 0 the argmax is used.
        top_p
            Nulceus sampling, higher means model considers more less likely words.
        sampler
            The sampler string defined by SequenceGeneratorAdapter
        logits_processor
            Augment logits before sampling.
        """
        import mlx.core as mx
        import mlx_lm

        temperature: float = temp or 1.0

        def sample(logits: "mx.array") -> Tuple["mx.array", float]:
            softmax_logits = mx.softmax(logits)

            if temperature == 0.0 or sampler == "greedy":
                token = mx.argmax(logits, axis=-1)
            elif sampler == "multinomial":
                if top_p is not None and top_p > 0 and top_p < 1.0:
                    token = mlx_lm.sample_utils.top_p_sampling(
                        logits, top_p, temperature
                    )
                else:
                    token = mx.random.categorical(logits * (1 / temperature))
            else:
                raise ValueError(f"Invalid mlx-lm sampler: `{sampler}`")

            prob = softmax_logits[0, token]
            return token, prob

        cache = mlx_lm.models.cache.make_prompt_cache(self.model)

        # kv cache contains processed input IDs, we pass the unprocessed inputs and cache to model()
        unprocessed_input_ids = prompt
        generated_ids: List[int] = []

        while True:
            logits = self.model(unprocessed_input_ids[None], cache=cache)
            logits = logits[:, -1, :]

            if logits_processor is not None:
                # convert to logits_processor 1d expectation, apply, then convert back
                logits_1d = logits.reshape(-1)
                logits_1d = logits_processor(generated_ids, logits_1d)
                logits = logits_1d.reshape(1, -1)

            new_token_single, prob = sample(logits)
            new_token = new_token_single.item()
            yield new_token, prob

            generated_ids.append(new_token)
            unprocessed_input_ids = new_token_single

generate_step(prompt, temp, top_p, sampler, logits_processor)

Adapted from https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L129

A generator producing token ids from the model, based on the given prompt.

Parameters

Name | Type | Description | Default
prompt | array | The input prompt. | required
temp | Optional[float] | The temperature for sampling; if 0, the argmax is used. | required
top_p | Optional[float] | Nucleus sampling; higher values mean the model considers more of the less likely words. | required
sampler | str | The sampler string defined by `SequenceGeneratorAdapter`. | required
logits_processor | OutlinesLogitsProcessor | Augment logits before sampling. | required

Source code in outlines/models/mlxlm.py
def generate_step(
    self,
    prompt: "mx.array",
    temp: Optional[float],
    top_p: Optional[float],
    sampler: str,
    logits_processor: "OutlinesLogitsProcessor",
) -> Generator[Tuple[int, float], None, None]:
    """
    Adapted from
    https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L129

    A generator producing token ids based on the given prompt from the model.

    Parameters
    ----------
    prompt
        The input prompt.
    temp
        The temperature for sampling, if 0 the argmax is used.
    top_p
        Nulceus sampling, higher means model considers more less likely words.
    sampler
        The sampler string defined by SequenceGeneratorAdapter
    logits_processor
        Augment logits before sampling.
    """
    import mlx.core as mx
    import mlx_lm

    temperature: float = temp or 1.0

    def sample(logits: "mx.array") -> Tuple["mx.array", float]:
        softmax_logits = mx.softmax(logits)

        if temperature == 0.0 or sampler == "greedy":
            token = mx.argmax(logits, axis=-1)
        elif sampler == "multinomial":
            if top_p is not None and top_p > 0 and top_p < 1.0:
                token = mlx_lm.sample_utils.top_p_sampling(
                    logits, top_p, temperature
                )
            else:
                token = mx.random.categorical(logits * (1 / temperature))
        else:
            raise ValueError(f"Invalid mlx-lm sampler: `{sampler}`")

        prob = softmax_logits[0, token]
        return token, prob

    cache = mlx_lm.models.cache.make_prompt_cache(self.model)

    # kv cache contains processed input IDs, we pass the unprocessed inputs and cache to model()
    unprocessed_input_ids = prompt
    generated_ids: List[int] = []

    while True:
        logits = self.model(unprocessed_input_ids[None], cache=cache)
        logits = logits[:, -1, :]

        if logits_processor is not None:
            # convert to logits_processor 1d expectation, apply, then convert back
            logits_1d = logits.reshape(-1)
            logits_1d = logits_processor(generated_ids, logits_1d)
            logits = logits_1d.reshape(1, -1)

        new_token_single, prob = sample(logits)
        new_token = new_token_single.item()
        yield new_token, prob

        generated_ids.append(new_token)
        unprocessed_input_ids = new_token_single

stream(prompts, generation_parameters, logits_processor, sampling_parameters)

Generate text using `mlx_lm`.

Parameters

Name | Type | Description | Default
prompts | Union[str, List[str]] | A prompt or list of prompts. | required
generation_parameters | GenerationParameters | An instance of `GenerationParameters` that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to `SequenceGeneratorAdapter`'s `__call__` method. | required
logits_processor | | The logits processor to use when generating text. | required
sampling_parameters | SamplingParameters | An instance of `SamplingParameters`, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines. | required

Returns

The generated text.

Source code in outlines/models/mlxlm.py
def stream(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: "GenerationParameters",
    logits_processor,
    sampling_parameters: "SamplingParameters",
) -> Iterator[str]:
    """Generate text using `mlx_lm`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__cal__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.

    Returns
    -------
    The generated text.
    """
    import mlx.core as mx

    max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
    sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
        sampling_parameters
    )
    if max_tokens is None:
        max_tokens = int(1e9)

    if not isinstance(prompts, str):
        raise NotImplementedError(
            "The `mlx-lm` library does not support batch inference."
        )
    if sampler == "beam_search":
        raise NotImplementedError(
            "The `mlx-lm` library does not support Beam Search."
        )
    if num_samples != 1:
        raise NotImplementedError(
            "The `mlx-lm` library does not allow to take several samples."
        )
    if top_k is not None:
        raise NotImplementedError("The `mlx-lm` library does not support top_k.")
    if seed is not None:
        raise NotImplementedError("The `mlx-lm` library does not support seed.")
    if stop_at is not None:
        raise NotImplementedError("The `mlx-lm` library does not support stop_at.")

    generate_kwargs = {
        "temp": temperature,
        "top_p": top_p,
        "sampler": sampler,
        "logits_processor": logits_processor,
    }

    # Adapted from
    # https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L267
    prompt_tokens = mx.array(self.mlx_tokenizer.encode(prompts))

    detokenizer = self.mlx_tokenizer.detokenizer
    detokenizer.reset()

    for (token, prob), n in zip(
        self.generate_step(prompt_tokens, **generate_kwargs),
        range(max_tokens),
    ):
        if token == self.tokenizer.eos_token_id:
            break
        detokenizer.add_token(token)
        yield detokenizer.last_segment

    detokenizer.finalize()
    yield detokenizer.last_segment

mlxlm(model_name, tokenizer_config={}, model_config={}, adapter_path=None, lazy=False)

mlx_lm 库实例化模型及其 tokenizer。

签名改编自 https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L422

参数

名称 类型 描述 默认值
参数

path_or_hf_repo (Path):加载模型的路径或 huggingface 仓库。tokenizer_config (dict, 可选):tokenizer 的特定配置参数。默认为空字典。model_config(dict, 可选):模型的特定配置参数。默认为空字典。adapter_path (str, 可选):LoRA 适配器路径。如果提供,则将 LoRA 层应用于模型。默认值:None。lazy (bool):如果为 False,则评估模型参数,确保在返回前已加载到内存中,否则在需要时加载。默认值:False

必需

返回值

类型 描述
一个 `MLXLM` 模型实例。
源代码位于 outlines/models/mlxlm.py
def mlxlm(
    model_name: str,
    tokenizer_config: dict = {},
    model_config: dict = {},
    adapter_path: Optional[str] = None,
    lazy: bool = False,
):
    """Instantiate a model from the `mlx_lm` library and its tokenizer.

    Signature adapted from
    https://github.com/ml-explore/mlx-examples/blob/4872727/llms/mlx_lm/utils.py#L422

    Parameters
    ----------
    Args:
        path_or_hf_repo (Path): The path or the huggingface repository to load the model from.
        tokenizer_config (dict, optional): Configuration parameters specifically for the tokenizer.
            Defaults to an empty dictionary.
        model_config(dict, optional): Configuration parameters specifically for the model.
            Defaults to an empty dictionary.
        adapter_path (str, optional): Path to the LoRA adapters. If provided, applies LoRA layers
            to the model. Default: ``None``.
        lazy (bool): If False eval the model parameters to make sure they are
            loaded in memory before returning, otherwise they will be loaded
            when needed. Default: ``False``

    Returns
    -------
    A `MLXLM` model instance.

    """
    try:
        import mlx.core as mx
        import mlx_lm
    except ImportError:
        raise ImportError(
            "The `mlx_lm` library needs to be installed in order to use `mlx_lm` models."
        )
    if not mx.metal.is_available():
        raise RuntimeError("You cannot use `mlx_lm` without Apple Silicon (Metal)")

    model, tokenizer = mlx_lm.load(
        model_name,
        tokenizer_config=tokenizer_config,
        model_config=model_config,
        adapter_path=adapter_path,
        lazy=lazy,
    )
    return MLXLM(model, tokenizer)
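
A minimal usage sketch (requires Apple Silicon); the repository id is illustrative:

from outlines import models

model = models.mlxlm("mlx-community/Meta-Llama-3-8B-Instruct-4bit")  # illustrative MLX repo id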

openai

Integration with OpenAI's API.

OpenAI

An object that represents the OpenAI API.

Source code in outlines/models/openai.py
class OpenAI:
    """An object that represents the OpenAI API."""

    def __init__(
        self,
        client,
        config,
        system_prompt: Optional[str] = None,
    ):
        """Create an `OpenAI` instance.

        This class supports the standard OpenAI API, the Azure OpeanAI API as
        well as compatible APIs that rely on the OpenAI client.

        Parameters
        ----------
        client
            An instance of the API's async client.
        config
            An instance of `OpenAIConfig`. Can be useful to specify some
            parameters that cannot be set by calling this class' methods.
        """

        self.client = client
        self.config = config

        # We count the total number of prompt and generated tokens as returned
        # by the OpenAI API, summed over all the requests performed with this
        # model instance.
        self.prompt_tokens = 0
        self.completion_tokens = 0

        self.format_sequence = lambda seq: seq

    def __call__(
        self,
        prompt: Union[str, List[str]],
        max_tokens: Optional[int] = None,
        stop_at: Optional[Union[List[str], str]] = None,
        *,
        system_prompt: Optional[str] = None,
        temperature: Optional[float] = None,
        samples: Optional[int] = None,
    ) -> np.ndarray:
        """Call the OpenAI API to generate text.

        Parameters
        ----------
        prompt
            A string or list of strings that will be used to prompt the model
        max_tokens
            The maximum number of tokens to generate
        stop_at
            A string or array of strings which, such that the generation stops
            when they are generated.
        system_prompt
            The content of the system message that precedes the user's prompt.
        temperature
            The value of the temperature used to sample tokens
        samples
            The number of completions to generate for each prompt
        stop_at
            Up to 4 words where the API will stop the completion.

        """
        if max_tokens is None:
            max_tokens = self.config.max_tokens
        if stop_at is None:
            stop_at = self.config.stop
        if temperature is None:
            temperature = self.config.temperature
        if samples is None:
            samples = self.config.n

        config = replace(
            self.config,
            max_tokens=max_tokens,
            temperature=temperature,
            n=samples,
            stop=stop_at,
        )  # type: ignore

        response, prompt_tokens, completion_tokens = generate_chat(
            prompt, system_prompt, self.client, config
        )
        self.prompt_tokens += prompt_tokens
        self.completion_tokens += completion_tokens

        return self.format_sequence(response)

    def stream(self, *args, **kwargs):
        raise NotImplementedError(
            "Streaming is currently not supported for the OpenAI API"
        )

    def new_with_replacements(self, **kwargs):
        new_instance = copy.copy(self)
        new_instance.config = replace(new_instance.config, **kwargs)
        return new_instance

    def __str__(self):
        return self.__class__.__name__ + " API"

    def __repr__(self):
        return str(self.config)

__call__(prompt, max_tokens=None, stop_at=None, *, system_prompt=None, temperature=None, samples=None)

Call the OpenAI API to generate text.

Parameters

- prompt (Union[str, List[str]], required): A string or list of strings that will be used to prompt the model.
- max_tokens (Optional[int], default: None): The maximum number of tokens to generate.
- stop_at (Optional[Union[List[str], str]], default: None): A string or list of strings at which the generation stops; the API accepts up to 4 such stop sequences.
- system_prompt (Optional[str], default: None): The content of the system message that precedes the user's prompt.
- temperature (Optional[float], default: None): The temperature used to sample tokens.
- samples (Optional[int], default: None): The number of completions to generate for each prompt.

Source code in outlines/models/openai.py
def __call__(
    self,
    prompt: Union[str, List[str]],
    max_tokens: Optional[int] = None,
    stop_at: Optional[Union[List[str], str]] = None,
    *,
    system_prompt: Optional[str] = None,
    temperature: Optional[float] = None,
    samples: Optional[int] = None,
) -> np.ndarray:
    """Call the OpenAI API to generate text.

    Parameters
    ----------
    prompt
        A string or list of strings that will be used to prompt the model
    max_tokens
        The maximum number of tokens to generate
    stop_at
        A string or array of strings which, such that the generation stops
        when they are generated.
    system_prompt
        The content of the system message that precedes the user's prompt.
    temperature
        The value of the temperature used to sample tokens
    samples
        The number of completions to generate for each prompt
    stop_at
        Up to 4 words where the API will stop the completion.

    """
    if max_tokens is None:
        max_tokens = self.config.max_tokens
    if stop_at is None:
        stop_at = self.config.stop
    if temperature is None:
        temperature = self.config.temperature
    if samples is None:
        samples = self.config.n

    config = replace(
        self.config,
        max_tokens=max_tokens,
        temperature=temperature,
        n=samples,
        stop=stop_at,
    )  # type: ignore

    response, prompt_tokens, completion_tokens = generate_chat(
        prompt, system_prompt, self.client, config
    )
    self.prompt_tokens += prompt_tokens
    self.completion_tokens += completion_tokens

    return self.format_sequence(response)

__init__(client, config, system_prompt=None)

Create an `OpenAI` instance.

This class supports the standard OpenAI API, the Azure OpenAI API, as well as compatible APIs that rely on the OpenAI client.

Parameters

- client (required): An instance of the API's async client.
- config (required): An instance of `OpenAIConfig`. Useful to specify parameters that cannot be set by calling this class' methods.

Source code in outlines/models/openai.py
def __init__(
    self,
    client,
    config,
    system_prompt: Optional[str] = None,
):
    """Create an `OpenAI` instance.

        This class supports the standard OpenAI API, the Azure OpenAI API as
    well as compatible APIs that rely on the OpenAI client.

    Parameters
    ----------
    client
        An instance of the API's async client.
    config
        An instance of `OpenAIConfig`. Can be useful to specify some
        parameters that cannot be set by calling this class' methods.
    """

    self.client = client
    self.config = config

    # We count the total number of prompt and generated tokens as returned
    # by the OpenAI API, summed over all the requests performed with this
    # model instance.
    self.prompt_tokens = 0
    self.completion_tokens = 0

    self.format_sequence = lambda seq: seq

OpenAIConfig dataclass

Represents the parameters of the OpenAI API.

The information was last fetched on 2023/11/20. We document below the properties that are specific to the OpenAI API. Not all of these properties are supported by Outlines.

Parameters

- model (str, default: ''): The name of the model. Available models can be found on OpenAI's website.
- frequency_penalty (float, default: 0): Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text.
- logit_bias (Dict[int, int], default: dict()): Modifies the likelihood of specified tokens appearing in the completion. Number between -100 (forbid) and +100 (only allow).
- n (int, default: 1): The number of completions to return for each prompt.
- presence_penalty (float, default: 0): Similar to the frequency penalty.
- response_format (Optional[Dict[str, str]], default: None): Specifies the format the model must output. `{"type": "json_object"}` enables JSON mode.
- seed (Optional[int], default: None): Two completions with the same `seed` value should return the same completion; this is however not guaranteed.
- stop (Optional[Union[str, List[str]]], default: None): Up to 4 sequences at which the API will stop the completion.
- temperature (float, default: 1.0): Number between 0 and 2. Higher values make the output more random, lower values make it more deterministic.
- top_p (int, default: 1): Number between 0 and 1. Parameter for nucleus sampling.
- user (str, default: str()): A unique identifier for the end user.

Source code in outlines/models/openai.py
@dataclass(frozen=True)
class OpenAIConfig:
    """Represents the parameters of the OpenAI API.

    The information was last fetched on 2023/11/20. We document below the
    properties that are specific to the OpenAI API. Not all these properties are
    supported by Outlines.

    Parameters
    ----------
    model
        The name of the model. Available models can be found on OpenAI's website.
    frequency_penalty
        Number between 2.0 and -2.0. Positive values penalize new tokens based on
        their existing frequency in the text,
    logit_bias
        Modifies the likelihood of specified tokens to appear in the completion.
        Number between -100 (forbid) and +100 (only allows).
    n
        The number of completions to return for each prompt.
    presence_penalty
        Similar to frequency penalty.
    response_format
        Specifies the format the model must output. `{"type": "json_object"}`
        enables JSON mode.
    seed
        Two completions with the same `seed` value should return the same
        completion. This is however not guaranteed.
    stop
        Up to 4 words where the API will stop the completion.
    temperature
        Number between 0 and 2. Higher values make the output more random, while
        lower values make it more deterministic.
    top_p
        Number between 0 and 1. Parameter for nucleus sampling.
    user
        A unique identifier for the end-user.
    """

    model: str = ""
    frequency_penalty: float = 0
    logit_bias: Dict[int, int] = field(default_factory=dict)
    max_tokens: Optional[int] = None
    n: int = 1
    presence_penalty: float = 0
    response_format: Optional[Dict[str, str]] = None
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = None
    temperature: float = 1.0
    top_p: int = 1
    user: str = field(default_factory=str)
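
Example (a minimal wiring sketch combining `OpenAIConfig` with the `OpenAI` wrapper above; it assumes the `openai` Python package is installed and `OPENAI_API_KEY` is set, and the model name is only an example):

from openai import AsyncOpenAI

from outlines.models.openai import OpenAI, OpenAIConfig

config = OpenAIConfig(model="gpt-4o-mini", temperature=0.5, max_tokens=256)
client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

model = OpenAI(client, config)
# `__call__` delegates to `generate_chat` and returns the completion(s).
answer = model("Name three prime numbers.", max_tokens=64)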

error_handler(api_call_fn)

Handle OpenAI API errors and a missing API key.

Source code in outlines/models/openai.py
def error_handler(api_call_fn: Callable) -> Callable:
    """Handle OpenAI API errors and missing API key."""

    def call(*args, **kwargs):
        import openai

        try:
            return api_call_fn(*args, **kwargs)
        except (
            openai.APITimeoutError,
            openai.InternalServerError,
            openai.RateLimitError,
        ) as e:
            raise OSError(f"Could not connect to the OpenAI API: {e}")
        except (
            openai.AuthenticationError,
            openai.BadRequestError,
            openai.ConflictError,
            openai.PermissionDeniedError,
            openai.NotFoundError,
            openai.UnprocessableEntityError,
        ) as e:
            raise e

    return call
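
For illustration, the decorator can wrap any callable that goes through the OpenAI client; the wrapped function below is hypothetical:

from outlines.models.openai import error_handler

@error_handler
def list_models(client):
    # Connection-level failures are re-raised as `OSError` by `error_handler`.
    return client.models.list()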

generate_chat(prompt, system_prompt, client, config) async

Call OpenAI's Chat Completion API.

Parameters

- prompt (str, required): The prompt we use to start the generation. Passed to the model with the "user" role.
- system_prompt (Union[str, None], required): The system prompt, passed to the model with the "system" role before the prompt.
- client (required): The API client.
- config (OpenAIConfig, required): An `OpenAIConfig` instance.

Returns

A tuple that contains the model's response(s) and usage statistics.

Source code in outlines/models/openai.py
@functools.partial(vectorize, signature="(),(),(),()->(s),(),()")
async def generate_chat(
    prompt: str,
    system_prompt: Union[str, None],
    client,
    config: OpenAIConfig,
) -> Tuple[np.ndarray, int, int]:
    """Call OpenAI's Chat Completion API.

    Parameters
    ----------
    prompt
        The prompt we use to start the generation. Passed to the model
        with the "user" role.
    system_prompt
        The system prompt, passed to the model with the "system" role
        before the prompt.
    client
        The API client
    config
        An `OpenAIConfig` instance.

    Returns
    -------
    A tuple that contains the model's response(s) and usage statistics.

    """

    @error_handler
    @cache()
    async def call_api(prompt, system_prompt, config):
        responses = await client.chat.completions.create(
            messages=system_message + user_message,
            **asdict(config),  # type: ignore
        )
        return responses.model_dump()

    system_message = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    user_message = [{"role": "user", "content": prompt}]

    responses = await call_api(prompt, system_prompt, config)

    results = np.array(
        [responses["choices"][i]["message"]["content"] for i in range(config.n)]
    )
    usage = responses["usage"]

    return results, usage["prompt_tokens"], usage["completion_tokens"]

tokenizer

Tokenizer

Bases: Hashable, Protocol

Source code in outlines/models/tokenizer.py
class Tokenizer(Hashable, Protocol):
    eos_token: str
    eos_token_id: int
    pad_token_id: int
    vocabulary: Dict[str, int]
    special_tokens: Set[str]

    def encode(
        self, prompt: Union[str, List[str]]
    ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
        """Translate the input prompts into arrays of token ids and attention mask."""
        ...

    def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
        """Translate an array of token ids to a string or list of strings."""
        ...

    def convert_token_to_string(self, token: str) -> str:
        """Convert a token to its equivalent string.

        This is for instance useful for BPE tokenizers where whitespaces are
        represented by the special character `Ġ`. This prevents matching a raw
        token that includes `Ġ` with a string.
        """
        ...
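
For illustration, a toy whitespace tokenizer that satisfies this protocol could look as follows; the class, its vocabulary, and the padding scheme are hypothetical and only meant to show the required interface:

from typing import Dict, List, Set, Tuple, Union

import numpy as np
from numpy.typing import NDArray


class WhitespaceTokenizer:
    """Toy tokenizer that conforms to the `Tokenizer` protocol."""

    eos_token: str = "<eos>"
    eos_token_id: int = 0
    pad_token_id: int = 0
    vocabulary: Dict[str, int] = {"<eos>": 0, "hello": 1, "world": 2}
    special_tokens: Set[str] = {"<eos>"}

    def encode(
        self, prompt: Union[str, List[str]]
    ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
        prompts = [prompt] if isinstance(prompt, str) else prompt
        rows = [
            [self.vocabulary.get(word, self.pad_token_id) for word in p.split()]
            for p in prompts
        ]
        width = max(len(row) for row in rows)
        # Left-pad with the pad token so every row has the same length.
        token_ids = np.array(
            [[self.pad_token_id] * (width - len(row)) + row for row in rows],
            dtype=np.int64,
        )
        attention_mask = (token_ids != self.pad_token_id).astype(np.int64)
        return token_ids, attention_mask

    def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
        inverse = {token_id: token for token, token_id in self.vocabulary.items()}
        return [" ".join(inverse[int(i)] for i in row) for row in token_ids]

    def convert_token_to_string(self, token: str) -> str:
        return token

    def __eq__(self, other):
        return isinstance(other, WhitespaceTokenizer)

    def __hash__(self):
        return hash(tuple(sorted(self.vocabulary.items())))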

convert_token_to_string(token)

Convert a token to its equivalent string.

This is for instance useful for BPE tokenizers, where whitespace is represented by the special character `Ġ`. It prevents matching a raw token that includes `Ġ` against a string.

Source code in outlines/models/tokenizer.py
def convert_token_to_string(self, token: str) -> str:
    """Convert a token to its equivalent string.

    This is for instance useful for BPE tokenizers where whitespaces are
    represented by the special character `Ġ`. This prevents matching a raw
    token that includes `Ġ` with a string.
    """
    ...

decode(token_ids)

Translate an array of token ids into a string or a list of strings.

Source code in outlines/models/tokenizer.py
def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
    """Translate an array of token ids to a string or list of strings."""
    ...

encode(prompt)

Translate the input prompts into arrays of token ids and an attention mask.

Source code in outlines/models/tokenizer.py
def encode(
    self, prompt: Union[str, List[str]]
) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
    """Translate the input prompts into arrays of token ids and attention mask."""
    ...

transformers

TransformerTokenizer

Bases: Tokenizer

Represents a tokenizer for models in the `transformers` library.

Source code in outlines/models/transformers.py
class TransformerTokenizer(Tokenizer):
    """Represents a tokenizer for models in the `transformers` library."""

    def __init__(self, tokenizer: "PreTrainedTokenizer", **kwargs):
        self.tokenizer = tokenizer
        self.eos_token_id = self.tokenizer.eos_token_id
        self.eos_token = self.tokenizer.eos_token

        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            self.pad_token_id = self.eos_token_id
        else:
            self.pad_token_id = self.tokenizer.pad_token_id
            self.pad_token = self.tokenizer.pad_token

        self.special_tokens = set(self.tokenizer.all_special_tokens)

        self.vocabulary = self.tokenizer.get_vocab()
        self.is_llama = isinstance(self.tokenizer, get_llama_tokenizer_types())

    def encode(
        self, prompt: Union[str, List[str]], **kwargs
    ) -> Tuple["torch.LongTensor", "torch.LongTensor"]:
        kwargs["padding"] = True
        kwargs["return_tensors"] = "pt"
        output = self.tokenizer(prompt, **kwargs)
        return output["input_ids"], output["attention_mask"]

    def decode(self, token_ids: "torch.LongTensor") -> List[str]:
        text = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return text

    def convert_token_to_string(self, token: str) -> str:
        from transformers.file_utils import SPIECE_UNDERLINE

        string = self.tokenizer.convert_tokens_to_string([token])

        if self.is_llama:
            # A hack to handle missing spaces to HF's Llama tokenizers
            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
                return " " + string

        return string

    def __eq__(self, other):
        if isinstance(other, type(self)):
            if hasattr(self, "model_name") and hasattr(self, "kwargs"):
                return (
                    other.model_name == self.model_name and other.kwargs == self.kwargs
                )
            else:
                return other.tokenizer == self.tokenizer
        return NotImplemented

    def __hash__(self):
        from datasets.fingerprint import Hasher

        return hash(Hasher.hash(self.tokenizer))

    def __getstate__(self):
        state = {"tokenizer": self.tokenizer}
        return state

    def __setstate__(self, state):
        self.__init__(state["tokenizer"])
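
Example (a short sketch of wrapping a Hugging Face tokenizer; the checkpoint name is only an example):

from transformers import AutoTokenizer

from outlines.models.transformers import TransformerTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = TransformerTokenizer(hf_tokenizer)

# `encode` pads the batch and returns PyTorch tensors.
input_ids, attention_mask = tokenizer.encode(["Hello world", "Hi"])
texts = tokenizer.decode(input_ids)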

Transformers

Represents a `transformers` model.

Source code in outlines/models/transformers.py
class Transformers:
    """Represents a `transformers` model."""

    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
    ):
        self.model = model
        self.tokenizer = TransformerTokenizer(tokenizer)

    def forward(
        self,
        input_ids: "torch.LongTensor",
        attention_mask: "torch.LongTensor",
        past_key_values: Optional[Tuple] = None,
    ) -> Tuple["torch.FloatTensor", Optional[KVCacheType]]:
        """Compute a forward pass through the transformer model.

        Parameters
        ----------
        input_ids
            The input token ids.  Must be one or two dimensional.
        attention_mask
            The attention mask.  Must be one or two dimensional.
        past_key_values
            A tuple of tuples containing the cached key and value tensors for each
            attention head.

        Returns
        -------
        The computed logits and the new cached key and value tensors.

        """
        try:
            import torch
        except ImportError:
            raise ImportError(
                "The `torch` library needs to be installed to use `transformers` models."
            )
        assert 0 < input_ids.ndim < 3

        if past_key_values:
            input_ids = input_ids[..., -1].unsqueeze(-1)

        with torch.inference_mode():
            output = self.model(
                input_ids,
                attention_mask=attention_mask,
                return_dict=True,
                output_attentions=False,
                output_hidden_states=False,
                past_key_values=past_key_values,
            )

        return output.logits, output.past_key_values

    def __call__(
        self,
        input_ids: "torch.LongTensor",
        attention_mask: "torch.LongTensor",
        past_key_values: Optional[Tuple] = None,
    ) -> "torch.FloatTensor":
        logits, kv_cache = self.forward(input_ids, attention_mask, past_key_values)
        next_token_logits = logits[..., -1, :]

        return next_token_logits, kv_cache

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Union[str, List[str], List[List[str]]]:
        """Generate text using `transformers`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.

        Returns
        -------
        The generated text
        """
        if isinstance(prompts, str):
            # convert to 2d
            input_ids, attention_mask = self.tokenizer.encode([prompts])
        else:
            input_ids, attention_mask = self.tokenizer.encode(prompts)

        inputs = {
            "input_ids": input_ids.to(self.model.device),
            "attention_mask": attention_mask.to(self.model.device),
        }
        if (
            "attention_mask"
            not in inspect.signature(self.model.forward).parameters.keys()
        ):
            del inputs["attention_mask"]

        generation_kwargs = self._get_generation_kwargs(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

        # if single str input and single sample per input, convert to a 1D output
        if isinstance(prompts, str):
            generated_ids = generated_ids.squeeze(0)

        return self._decode_generation(generated_ids)

    def stream(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Iterator[Union[str, List[str]]]:
        """
        Temporary stream stand-in which implements stream() signature
        and equivalent behaviour but isn't yielded until generation completes.

        TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810
        """
        if isinstance(prompts, str):
            # convert to 2d
            input_ids, attention_mask = self.tokenizer.encode([prompts])
        else:
            input_ids, attention_mask = self.tokenizer.encode(prompts)
        inputs = {
            "input_ids": input_ids.to(self.model.device),
            "attention_mask": attention_mask.to(self.model.device),
        }
        if (
            "attention_mask"
            not in inspect.signature(self.model.forward).parameters.keys()
        ):
            del inputs["attention_mask"]

        generation_kwargs = self._get_generation_kwargs(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

        # if single str input and single sample per input, convert to a 1D output
        if isinstance(prompts, str):
            generated_ids = generated_ids.squeeze(0)

        for i in range(generated_ids.size(-1)):
            output_group_ids = generated_ids.select(-1, i).unsqueeze(-1)
            yield self._decode_generation(output_group_ids)

    def _get_generation_kwargs(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> dict:
        """
        Convert outlines generation parameters into model.generate kwargs
        """
        from transformers import GenerationConfig, LogitsProcessorList, set_seed

        max_new_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )
        if max_new_tokens is None:
            max_new_tokens = int(2**30)

        # global seed, not desirable
        if seed is not None:
            set_seed(seed)

        if logits_processor is not None:
            logits_processor_list = LogitsProcessorList([logits_processor])
        else:
            logits_processor_list = None

        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            stop_strings=stop_at,
            num_return_sequences=(num_samples or 1),
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            do_sample=(sampler == "multinomial"),
            num_beams=(num_samples if sampler == "beam_search" else 1),
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
        )

        return dict(
            logits_processor=logits_processor_list,
            generation_config=generation_config,
            tokenizer=self.tokenizer.tokenizer,
        )

    def _generate_output_seq(
        self, prompts, inputs, generation_config, **generation_kwargs
    ):
        input_ids = inputs["input_ids"]
        output_ids = self.model.generate(
            **inputs, generation_config=generation_config, **generation_kwargs
        )

        # encoder-decoder returns output_ids only, decoder-only returns full seq ids
        if self.model.config.is_encoder_decoder:
            generated_ids = output_ids
        else:
            generated_ids = output_ids[:, input_ids.shape[1] :]

        # if batch list inputs AND multiple samples per input, convert generated_id to 3D view
        num_samples = generation_config.num_return_sequences or 1

        if num_samples > 1 and isinstance(prompts, list):
            batch_size = input_ids.size(0)
            num_return_sequences = generation_config.num_return_sequences or 1
            generated_ids = generated_ids.view(batch_size, num_return_sequences, -1)

        return generated_ids

    def _decode_generation(self, generated_ids: "torch.Tensor"):
        if len(generated_ids.shape) == 1:
            return self.tokenizer.decode([generated_ids])[0]
        elif len(generated_ids.shape) == 2:
            return self.tokenizer.decode(generated_ids)
        elif len(generated_ids.shape) == 3:
            return [
                self.tokenizer.decode(generated_ids[i])
                for i in range(len(generated_ids))
            ]
        else:
            raise TypeError(
                f"Generated outputs aren't 1D, 2D or 3D, but instead are {generated_ids.shape}"
            )

forward(input_ids, attention_mask, past_key_values=None)

Compute a forward pass through the transformer model.

Parameters

- input_ids (LongTensor, required): The input token ids. Must be one- or two-dimensional.
- attention_mask (LongTensor, required): The attention mask. Must be one- or two-dimensional.
- past_key_values (Optional[Tuple], default: None): A tuple of tuples containing the cached key and value tensors for each attention head.

Returns

The computed logits and the new cached key and value tensors.

Source code in outlines/models/transformers.py
def forward(
    self,
    input_ids: "torch.LongTensor",
    attention_mask: "torch.LongTensor",
    past_key_values: Optional[Tuple] = None,
) -> Tuple["torch.FloatTensor", Optional[KVCacheType]]:
    """Compute a forward pass through the transformer model.

    Parameters
    ----------
    input_ids
        The input token ids.  Must be one or two dimensional.
    attention_mask
        The attention mask.  Must be one or two dimensional.
    past_key_values
        A tuple of tuples containing the cached key and value tensors for each
        attention head.

    Returns
    -------
    The computed logits and the new cached key and value tensors.

    """
    try:
        import torch
    except ImportError:
        raise ImportError(
            "The `torch` library needs to be installed to use `transformers` models."
        )
    assert 0 < input_ids.ndim < 3

    if past_key_values:
        input_ids = input_ids[..., -1].unsqueeze(-1)

    with torch.inference_mode():
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            return_dict=True,
            output_attentions=False,
            output_hidden_states=False,
            past_key_values=past_key_values,
        )

    return output.logits, output.past_key_values

generate(prompts, generation_parameters, logits_processor, sampling_parameters)

Generate text using `transformers`.

Parameters

- prompts (Union[str, List[str]], required): A prompt or list of prompts.
- generation_parameters (GenerationParameters, required): An instance of `GenerationParameters` that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to `SequenceGeneratorAdapter`'s `__call__` method.
- logits_processor (Optional[OutlinesLogitsProcessor], required): The logits processor to use when generating text.
- sampling_parameters (SamplingParameters, required): An instance of `SamplingParameters`, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.

Returns

The generated text.

Source code in outlines/models/transformers.py
def generate(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Union[str, List[str], List[List[str]]]:
    """Generate text using `transformers`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.

    Returns
    -------
    The generated text
    """
    if isinstance(prompts, str):
        # convert to 2d
        input_ids, attention_mask = self.tokenizer.encode([prompts])
    else:
        input_ids, attention_mask = self.tokenizer.encode(prompts)

    inputs = {
        "input_ids": input_ids.to(self.model.device),
        "attention_mask": attention_mask.to(self.model.device),
    }
    if (
        "attention_mask"
        not in inspect.signature(self.model.forward).parameters.keys()
    ):
        del inputs["attention_mask"]

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # if single str input and single sample per input, convert to a 1D output
    if isinstance(prompts, str):
        generated_ids = generated_ids.squeeze(0)

    return self._decode_generation(generated_ids)

stream(prompts, generation_parameters, logits_processor, sampling_parameters)

A temporary stand-in for streaming: it implements the stream() signature and equivalent behaviour, but nothing is yielded until generation completes.

TODO: implement once https://github.com/huggingface/transformers/issues/30810 is resolved.

Source code in outlines/models/transformers.py
def stream(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Iterator[Union[str, List[str]]]:
    """
    Temporary stream stand-in which implements stream() signature
    and equivalent behaviour but isn't yielded until generation completes.

    TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810
    """
    if isinstance(prompts, str):
        # convert to 2d
        input_ids, attention_mask = self.tokenizer.encode([prompts])
    else:
        input_ids, attention_mask = self.tokenizer.encode(prompts)
    inputs = {
        "input_ids": input_ids.to(self.model.device),
        "attention_mask": attention_mask.to(self.model.device),
    }
    if (
        "attention_mask"
        not in inspect.signature(self.model.forward).parameters.keys()
    ):
        del inputs["attention_mask"]

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # if single str input and single sample per input, convert to a 1D output
    if isinstance(prompts, str):
        generated_ids = generated_ids.squeeze(0)

    for i in range(generated_ids.size(-1)):
        output_group_ids = generated_ids.select(-1, i).unsqueeze(-1)
        yield self._decode_generation(output_group_ids)

get_llama_tokenizer_types()

Get all the Llama tokenizer types/classes that need work-arounds.

When they cannot be imported, a dummy class is created instead.

Source code in outlines/models/transformers.py
def get_llama_tokenizer_types():
    """Get all the Llama tokenizer types/classes that need work-arounds.

    When they can't be imported, a dummy class is created.

    """
    try:
        from transformers.models.llama import LlamaTokenizer
    except ImportError:

        class LlamaTokenizer:  # type: ignore
            pass

    try:
        from transformers.models.llama import LlamaTokenizerFast
    except ImportError:

        class LlamaTokenizerFast:  # type: ignore
            pass

    try:
        from transformers.models.code_llama import CodeLlamaTokenizer
    except ImportError:

        class CodeLlamaTokenizer:  # type: ignore
            pass

    try:
        from transformers.models.code_llama import CodeLlamaTokenizerFast
    except ImportError:

        class CodeLlamaTokenizerFast:  # type: ignore
            pass

    return (
        LlamaTokenizer,
        LlamaTokenizerFast,
        CodeLlamaTokenizer,
        CodeLlamaTokenizerFast,
    )
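
For example, the returned tuple can be used directly in an `isinstance` check; the checkpoint name below is illustrative:

from transformers import AutoTokenizer

from outlines.models.transformers import get_llama_tokenizer_types

tokenizer = AutoTokenizer.from_pretrained("gpt2")
needs_workaround = isinstance(tokenizer, get_llama_tokenizer_types())  # False for GPT-2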

transformers(model_name, device=None, model_kwargs={}, tokenizer_kwargs={}, model_class=None, tokenizer_class=None)

Instantiate a model and its tokenizer from the `transformers` library.

Parameters

- model_name (str, required): The name of the model as listed on Hugging Face's model page.
- device (Optional[str], default: None): The device(s) on which the model should be loaded. When provided, this overrides the `device_map` entry in `model_kwargs`.
- model_kwargs (dict, default: {}): A dictionary of keyword arguments to pass to the `from_pretrained` method when loading the model.
- tokenizer_kwargs (dict, default: {}): A dictionary of keyword arguments to pass to the `from_pretrained` method when loading the tokenizer.

Returns

A `TransformersModel` model instance.

Source code in outlines/models/transformers.py
def transformers(
    model_name: str,
    device: Optional[str] = None,
    model_kwargs: dict = {},
    tokenizer_kwargs: dict = {},
    model_class=None,
    tokenizer_class=None,
):
    """Instantiate a model from the `transformers` library and its tokenizer.

    Parameters
    ----------
    model_name
        The name of the model as listed on Hugging Face's model page.
    device
        The device(s) on which the model should be loaded. This overrides
        the `device_map` entry in `model_kwargs` when provided.
    model_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the model.
    tokenizer_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the tokenizer.

    Returns
    -------
    A `TransformersModel` model instance.

    """
    if model_class is None or tokenizer_class is None:
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library needs to be installed in order to use `transformers` models."
            )
    if model_class is None:
        model_class = AutoModelForCausalLM
    if tokenizer_class is None:
        tokenizer_class = AutoTokenizer

    if device is not None:
        model_kwargs["device_map"] = device

    model = model_class.from_pretrained(model_name, **model_kwargs)

    tokenizer_kwargs.setdefault("padding_side", "left")
    tokenizer = tokenizer_class.from_pretrained(model_name, **tokenizer_kwargs)

    return Transformers(model, tokenizer)
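
Example (a minimal loading sketch; the checkpoint name and keyword arguments are examples, not requirements):

from outlines import models

model = models.transformers(
    "gpt2",
    model_kwargs={"torch_dtype": "auto"},
    tokenizer_kwargs={"use_fast": True},
)
input_ids, attention_mask = model.tokenizer.encode("Outlines wraps Hugging Face models")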

transformers_vision

TransformersVision

Bases: Transformers

Source code in outlines/models/transformers_vision.py
class TransformersVision(Transformers):
    def __init__(self, model, tokenizer, processor):
        super().__init__(model, tokenizer)
        self.processor = processor

    def generate(  # type: ignore
        self,
        prompts: Union[str, List[str]],
        media: Union[List[Any], List[List[Any]]],
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Union[str, List[str], List[List[str]]]:
        """Generate text using `transformers`.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        media
            A List[PIL.Image] or List[List[PIL.Image]]
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.

        Returns
        -------
        The generated text
        """
        inputs = self.processor(
            text=prompts, images=media, padding=True, return_tensors="pt"
        ).to(self.model.device)

        generation_kwargs = self._get_generation_kwargs(
            prompts,
            generation_parameters,
            logits_processor,
            sampling_parameters,
        )
        generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

        # if single str input and single sample per input, convert to a 1D output
        if isinstance(prompts, str):
            # Should always be true until NotImplementedError above is fixed
            generated_ids = generated_ids.squeeze(0)

        return self._decode_generation(generated_ids)

    def stream(  # type: ignore
        self,
        prompts: Union[str, List[str]],
        media: Union[Any, List[Any]],  # TODO: docstring
        generation_parameters: GenerationParameters,
        logits_processor: Optional["OutlinesLogitsProcessor"],
        sampling_parameters: SamplingParameters,
    ) -> Iterator[Union[str, List[str]]]:
        raise NotImplementedError

generate(prompts, media, generation_parameters, logits_processor, sampling_parameters)

Generate text using `transformers`.

Parameters

- prompts (Union[str, List[str]], required): A prompt or list of prompts.
- media (Union[List[Any], List[List[Any]]], required): A List[PIL.Image] or List[List[PIL.Image]].
- generation_parameters (GenerationParameters, required): An instance of `GenerationParameters` that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to `SequenceGeneratorAdapter`'s `__call__` method.
- logits_processor (Optional[OutlinesLogitsProcessor], required): The logits processor to use when generating text.
- sampling_parameters (SamplingParameters, required): An instance of `SamplingParameters`, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.

Returns

The generated text.

Source code in outlines/models/transformers_vision.py
def generate(  # type: ignore
    self,
    prompts: Union[str, List[str]],
    media: Union[List[Any], List[List[Any]]],
    generation_parameters: GenerationParameters,
    logits_processor: Optional["OutlinesLogitsProcessor"],
    sampling_parameters: SamplingParameters,
) -> Union[str, List[str], List[List[str]]]:
    """Generate text using `transformers`.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    media
        A List[PIL.Image] or List[List[PIL.Image]]
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.

    Returns
    -------
    The generated text
    """
    inputs = self.processor(
        text=prompts, images=media, padding=True, return_tensors="pt"
    ).to(self.model.device)

    generation_kwargs = self._get_generation_kwargs(
        prompts,
        generation_parameters,
        logits_processor,
        sampling_parameters,
    )
    generated_ids = self._generate_output_seq(prompts, inputs, **generation_kwargs)

    # if single str input and single sample per input, convert to a 1D output
    if isinstance(prompts, str):
        # Should always be true until NotImplementedError above is fixed
        generated_ids = generated_ids.squeeze(0)

    return self._decode_generation(generated_ids)

transformers_vision(model_name, model_class, device=None, model_kwargs={}, processor_kwargs={}, tokenizer_class=None, processor_class=None)

Instantiate a model and its tokenizer from the `transformers` library.

Parameters

- model_name (str, required): The name of the model as listed on Hugging Face's model page.
- model_class (required): The `PreTrainedModel` class from `transformers` used to initialize the vision model from `model_name`. https://hugging-face.cn/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel
- device (Optional[str], default: None): The device(s) on which the model should be loaded. When provided, this overrides the `device_map` entry in `model_kwargs`.
- model_kwargs (dict, default: {}): A dictionary of keyword arguments to pass to the `from_pretrained` method when loading the model.
- processor_kwargs (dict, default: {}): A dictionary of keyword arguments to pass to the `from_pretrained` method when loading the processor.

Returns

A `TransformersModel` model instance.

Source code in outlines/models/transformers_vision.py
def transformers_vision(
    model_name: str,
    model_class,
    device: Optional[str] = None,
    model_kwargs: dict = {},
    processor_kwargs: dict = {},
    tokenizer_class=None,
    processor_class=None,
):
    """Instantiate a model from the `transformers` library and its tokenizer.

    Parameters
    ----------
    model_name
        The name of the model as listed on Hugging Face's model page.
    model_class
        The `PreTrainedModel` class from transformers to use in initializing the vision model from `model_name`.
        https://hugging-face.cn/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel
    device
        The device(s) on which the model should be loaded. This overrides
        the `device_map` entry in `model_kwargs` when provided.
    model_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the model.
    processor_kwargs
        A dictionary that contains the keyword arguments to pass to the
        `from_pretrained` method when loading the processor.

    Returns
    -------
    A `TransformersModel` model instance.

    """
    if processor_class is None or tokenizer_class is None:
        try:
            from transformers import AutoProcessor, AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library needs to be installed in order to use `transformers` models."
            )
    if processor_class is None:
        processor_class = AutoProcessor
    if tokenizer_class is None:
        tokenizer_class = AutoTokenizer

    if device is not None:
        model_kwargs["device_map"] = device

    model = model_class.from_pretrained(model_name, **model_kwargs)

    processor_kwargs.setdefault("padding_side", "left")
    processor_kwargs.setdefault("pad_token", "[PAD]")
    processor = processor_class.from_pretrained(model_name, **processor_kwargs)

    if tokenizer_class is None:
        if getattr(processor, "tokenizer", None):
            tokenizer = processor.tokenizer
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_name, **processor_kwargs)
    else:
        tokenizer = tokenizer_class.from_pretrained(model_name, **processor_kwargs)

    return TransformersVision(model, tokenizer, processor)
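
Example (a loading sketch under the assumption that a LLaVA-style checkpoint is used; both the repository and the model class below are illustrative choices, not requirements of this function):

from transformers import LlavaNextForConditionalGeneration

from outlines.models.transformers_vision import transformers_vision

model = transformers_vision(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    model_class=LlavaNextForConditionalGeneration,
    device="cuda",
    model_kwargs={"torch_dtype": "auto"},
)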

vllm

VLLM

Represents a vLLM model.

We wrap models from model-providing libraries in order to give all of them the same interface in Outlines and allow users to easily switch between providers. This class wraps the `vllm.LLM` class from the `vllm` library.

Source code in outlines/models/vllm.py
class VLLM:
    """Represents a vLLM model.

    We wrap models from model providing libraries in order to give all of
    them the same interface in Outlines and allow users to easily switch
    between providers. This class wraps the `vllm.LLM` class from the
    `vllm` library.

    """

    def __init__(self, model: "LLM"):
        self.model = model
        self.lora_request = None

        self.tokenizer = self._get_tokenizer()

    def _get_tokenizer(self):
        if hasattr(self.model, "get_tokenizer"):
            tokenizer = self.model.get_tokenizer()
        elif hasattr(self.model, "tokenizer"):
            if hasattr(self.model.tokenizer, "tokenizer"):
                tokenizer = self.model.tokenizer.tokenizer
            else:
                tokenizer = self.model.tokenizer
        else:
            raise ValueError(
                "The provided LLM instance neither has a "
                "`tokenizer` attribute or a `get_tokenizer` method."
            )
        return adapt_tokenizer(tokenizer=tokenizer)

    def generate(
        self,
        prompts: Union[str, List[str]],
        generation_parameters: GenerationParameters,
        logits_processor,
        sampling_parameters: SamplingParameters,
        *,
        sampling_params: Optional["SamplingParams"] = None,
        use_tqdm: bool = True,
    ):
        """Generate text using vLLM.

        Parameters
        ----------
        prompts
            A prompt or list of prompts.
        generation_parameters
            An instance of `GenerationParameters` that contains the prompt,
            the maximum number of tokens, stop sequences and seed. All the
            arguments to `SequenceGeneratorAdapter`'s `__call__` method.
        logits_processor
            The logits processor to use when generating text.
        sampling_parameters
            An instance of `SamplingParameters`, a dataclass that contains
            the name of the sampler to use and related parameters as available
            in Outlines.
        sampling_params
            An instance of `vllm.sampling_params.SamplingParams`. The values
            passed via this dataclass supersede the values of the parameters
            in `generation_parameters` and `sampling_parameters`. See the
            vLLM documentation for more details: https://docs.vllm.com.cn/en/latest/dev/sampling_params.html.
        use_tqdm
            A boolean in order to display progress bar while inferencing

        Returns
        -------
        The generated text, of shape `(n_batch, n_samples)`. If there are only
        one batch and several samples, the list is of shape `(n_samples)`. If
        this is a batch with several sequences but only one sample the list is
        of shape `(n_batch)`. If there is only one sequence and one sample, a
        string is returned.

        """
        from vllm.sampling_params import SamplingParams

        if sampling_params is None:
            sampling_params = SamplingParams()

        max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

        # We only update the values in `sampling_params` if they
        # are specified by the user when calling the generator.
        if max_tokens is not None:
            sampling_params.max_tokens = max_tokens
        if stop_at is not None:
            if isinstance(stop_at, str):
                stop_at = [stop_at]
            sampling_params.stop = stop_at
        if seed is not None:
            sampling_params.seed = seed

        sampling_params.logits_processors = (
            [logits_processor] if logits_processor is not None else []
        )

        sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
            sampling_parameters
        )

        # We only update the values in `sampling_params` that
        # were not specified by the user.
        if sampling_params.n == 1:
            sampling_params.n = num_samples
            sampling_params.best_of = num_samples
        if top_p is not None and sampling_params.top_p == 1.0:
            sampling_params.top_p = top_p
        if top_k is not None and sampling_params.top_k == -1:
            sampling_params.top_k = top_k
            # TODO: remove this if statement once fixed
            # https://github.com/vllm-project/vllm/issues/5404#issuecomment-2175972897
            if top_k == 1:
                sampling_params.repetition_penalty = 0
        if temperature is not None and sampling_params.temperature == 1.0:
            sampling_params.temperature = temperature
        if sampler == "beam_search":
            sampling_params.use_beam_search = True

        results = self.model.generate(
            prompts,
            sampling_params=sampling_params,
            lora_request=self.lora_request,
            use_tqdm=use_tqdm,
        )
        results = [[sample.text for sample in batch.outputs] for batch in results]

        batch_size = len(results)
        sample_size = len(results[0])

        if batch_size == 1 and sample_size == 1:
            return results[0][0]
        elif batch_size == 1:
            return results[0]
        elif sample_size == 1:
            return [batch[0] for batch in results]

        return results

    def stream(self, *args, **kwargs):
        """Return a text generator.

        Streaming is not yet available for `vllm.LLM`.

        TODO: Implement the streaming functionality ourselves.

        """
        raise NotImplementedError(
            "Streaming is not available for the vLLM integration."
        )

    def load_lora(self, adapter_path: Optional[str]):
        from vllm.lora.request import LoRARequest

        if adapter_path is None:
            self.lora_request = None
        else:
            self.lora_request = LoRARequest(adapter_path, 1, adapter_path)

generate(prompts, generation_parameters, logits_processor, sampling_parameters, *, sampling_params=None, use_tqdm=True)

Generate text using vLLM.

Parameters

- prompts (Union[str, List[str]], required): A prompt or list of prompts.
- generation_parameters (GenerationParameters, required): An instance of `GenerationParameters` that contains the prompt, the maximum number of tokens, stop sequences and seed. All the arguments to `SequenceGeneratorAdapter`'s `__call__` method.
- logits_processor (required): The logits processor to use when generating text.
- sampling_parameters (SamplingParameters, required): An instance of `SamplingParameters`, a dataclass that contains the name of the sampler to use and related parameters as available in Outlines.
- sampling_params (Optional[SamplingParams], default: None): An instance of `vllm.sampling_params.SamplingParams`. Values passed via this dataclass supersede the values of the parameters in `generation_parameters` and `sampling_parameters`. See the vLLM documentation for more details: https://docs.vllm.com.cn/en/latest/dev/sampling_params.html.
- use_tqdm (bool, default: True): Whether to display a progress bar during inference.

Returns

The generated text, of shape `(n_batch, n_samples)`. If there is only one batch and several samples, the list is of shape `(n_samples)`. If this is a batch with several sequences but only one sample, the list is of shape `(n_batch)`. If there is only one sequence and one sample, a string is returned.

Source code in outlines/models/vllm.py
def generate(
    self,
    prompts: Union[str, List[str]],
    generation_parameters: GenerationParameters,
    logits_processor,
    sampling_parameters: SamplingParameters,
    *,
    sampling_params: Optional["SamplingParams"] = None,
    use_tqdm: bool = True,
):
    """Generate text using vLLM.

    Parameters
    ----------
    prompts
        A prompt or list of prompts.
    generation_parameters
        An instance of `GenerationParameters` that contains the prompt,
        the maximum number of tokens, stop sequences and seed. All the
        arguments to `SequenceGeneratorAdapter`'s `__call__` method.
    logits_processor
        The logits processor to use when generating text.
    sampling_parameters
        An instance of `SamplingParameters`, a dataclass that contains
        the name of the sampler to use and related parameters as available
        in Outlines.
    sampling_params
        An instance of `vllm.sampling_params.SamplingParams`. The values
        passed via this dataclass supersede the values of the parameters
        in `generation_parameters` and `sampling_parameters`. See the
        vLLM documentation for more details: https://docs.vllm.com.cn/en/latest/dev/sampling_params.html.
    use_tqdm
        A boolean in order to display progress bar while inferencing

    Returns
    -------
    The generated text, of shape `(n_batch, n_samples)`. If there are only
    one batch and several samples, the list is of shape `(n_samples)`. If
    this is a batch with several sequences but only one sample the list is
    of shape `(n_batch)`. If there is only one sequence and one sample, a
    string is returned.

    """
    from vllm.sampling_params import SamplingParams

    if sampling_params is None:
        sampling_params = SamplingParams()

    max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)

    # We only update the values in `sampling_params` if they
    # are specified by the user when calling the generator.
    if max_tokens is not None:
        sampling_params.max_tokens = max_tokens
    if stop_at is not None:
        if isinstance(stop_at, str):
            stop_at = [stop_at]
        sampling_params.stop = stop_at
    if seed is not None:
        sampling_params.seed = seed

    sampling_params.logits_processors = (
        [logits_processor] if logits_processor is not None else []
    )

    sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
        sampling_parameters
    )

    # We only update the values in `sampling_params` that
    # were not specified by the user.
    if sampling_params.n == 1:
        sampling_params.n = num_samples
        sampling_params.best_of = num_samples
    if top_p is not None and sampling_params.top_p == 1.0:
        sampling_params.top_p = top_p
    if top_k is not None and sampling_params.top_k == -1:
        sampling_params.top_k = top_k
        # TODO: remove this if statement once fixed
        # https://github.com/vllm-project/vllm/issues/5404#issuecomment-2175972897
        if top_k == 1:
            sampling_params.repetition_penalty = 0
    if temperature is not None and sampling_params.temperature == 1.0:
        sampling_params.temperature = temperature
    if sampler == "beam_search":
        sampling_params.use_beam_search = True

    results = self.model.generate(
        prompts,
        sampling_params=sampling_params,
        lora_request=self.lora_request,
        use_tqdm=use_tqdm,
    )
    results = [[sample.text for sample in batch.outputs] for batch in results]

    batch_size = len(results)
    sample_size = len(results[0])

    if batch_size == 1 and sample_size == 1:
        return results[0][0]
    elif batch_size == 1:
        return results[0]
    elif sample_size == 1:
        return [batch[0] for batch in results]

    return results

stream(*args, **kwargs)

Return a text generator.

Streaming is not yet available for `vllm.LLM`.

TODO: Implement the streaming functionality ourselves.

Source code in outlines/models/vllm.py
def stream(self, *args, **kwargs):
    """Return a text generator.

    Streaming is not yet available for `vllm.LLM`.

    TODO: Implement the streaming functionality ourselves.

    """
    raise NotImplementedError(
        "Streaming is not available for the vLLM integration."
    )

adapt_tokenizer(tokenizer)

Adapt a tokenizer so it can be used to compile an FSM.

The API of Outlines tokenizers is slightly different from that of `transformers`. In addition, we need to handle the missing spaces of Llama's tokenizer to be able to compile FSMs for this model.

Parameters

- tokenizer (PreTrainedTokenizerBase, required): The tokenizer of the model.

Returns

PreTrainedTokenizerBase: The adapted tokenizer.

Source code in outlines/models/vllm.py
def adapt_tokenizer(tokenizer: "PreTrainedTokenizerBase") -> "PreTrainedTokenizerBase":
    """Adapt a tokenizer to use to compile the FSM.

    The API of Outlines tokenizers is slightly different to that of `transformers`. In
    addition we need to handle the missing spaces to Llama's tokenizer to be able to
    compile FSMs for this model.

    Parameters
    ----------
    tokenizer
        The tokenizer of the model.

    Returns
    -------
    PreTrainedTokenizerBase
        The adapted tokenizer.
    """
    from transformers import SPIECE_UNDERLINE

    tokenizer.vocabulary = tokenizer.get_vocab()
    tokenizer.special_tokens = set(tokenizer.all_special_tokens)

    def convert_token_to_string(token: Union[str, bytes]) -> str:
        string = tokenizer.convert_tokens_to_string([token])

        # A hack to handle missing spaces to HF's Llama tokenizers
        if (
            type(token) is str
            and token.startswith(SPIECE_UNDERLINE)
            or token == "<0x20>"
        ):
            return " " + string

        return string

    tokenizer.convert_token_to_string = convert_token_to_string

    return tokenizer
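
For illustration (the checkpoint name is only an example):

from transformers import AutoTokenizer

from outlines.models.vllm import adapt_tokenizer

tokenizer = adapt_tokenizer(AutoTokenizer.from_pretrained("gpt2"))
token_id = tokenizer.vocabulary["hello"]             # added `vocabulary` attribute
text = tokenizer.convert_token_to_string("Ġhello")   # BPE marker decoded to a space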

vllm(model_name, **vllm_model_params)

Load a vLLM model.

Parameters

- model_name (str, required): The name of the model to load from the Hugging Face hub.
- vllm_model_params (default: {}): vLLM-specific model parameters. See the vLLM code for the full list: https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py

Source code in outlines/models/vllm.py
def vllm(model_name: str, **vllm_model_params):
    """Load a vLLM model.

    Parameters
    ---------
    model_name
        The name of the model to load from the HuggingFace hub.
    vllm_model_params
        vLLM-specific model parameters. See the vLLM code for the full list:
        https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py

    """
    from vllm import LLM

    model = LLM(model_name, **vllm_model_params)

    return VLLM(model)
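
Example (a minimal loading sketch; the model name and the extra keyword argument are illustrative):

from outlines import models

# Any keyword argument accepted by `vllm.LLM` can be forwarded, e.g. `dtype`.
model = models.vllm("facebook/opt-125m", dtype="float16")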