Allow passing kwargs to all models (#222)

* Allow passing kwargs to all models
Author: Aymeric Roucher, 2025-01-16 23:03:38 +01:00, committed by GitHub
commit b4091cb5ce (parent a1d8f3c398)
4 changed files with 79 additions and 37 deletions
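
In short: `max_tokens` disappears from every model's `__call__` signature, and generation options are instead accepted once at construction and forwarded to the underlying client via `**self.kwargs`. A minimal sketch of the calling convention before and after this commit (values are illustrative):

```python
from smolagents import HfApiModel

messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}]

# Before this commit: max_tokens was a per-call argument (default 1500).
# response = HfApiModel()(messages, stop_sequences=["END"], max_tokens=1500)

# After this commit: generation options are bound at init and forwarded
# to every underlying completion call via **self.kwargs.
model = HfApiModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct", max_tokens=1500)
response = model(messages, stop_sequences=["END"])
print(response.content)
```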

File 1 of 4 (tools API reference docs)

@@ -57,6 +57,10 @@ contains the API docs for the underlying classes.
 [[autodoc]] VisitWebpageTool
 
+### UserInputTool
+
+[[autodoc]] UserInputTool
+
 ## ToolCollection
 
 [[autodoc]] ToolCollection

File 2 of 4 (UserInputTool source)

@@ -144,7 +144,7 @@ class UserInputTool(Tool):
     output_type = "string"
 
     def forward(self, question):
-        user_input = input(f"{question} => ")
+        user_input = input(f"{question} => Type your answer here:")
         return user_input
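
For reference, the tool just relays the question to stdin; a quick sketch of exercising the updated prompt directly (assuming `UserInputTool` is importable from the package root, as the docs change above suggests):

```python
from smolagents import UserInputTool

tool = UserInputTool()
# Prints "What is your name? => Type your answer here:" and waits for stdin.
answer = tool.forward("What is your name?")
print(f"Got: {answer}")
```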

File 3 of 4 (models source)

@@ -222,7 +222,6 @@ class Model:
         messages: List[Dict[str, str]],
         stop_sequences: Optional[List[str]] = None,
         grammar: Optional[str] = None,
-        max_tokens: int = 1500,
     ) -> ChatMessage:
         """Process the input messages and return the model's response.
@@ -233,8 +232,6 @@ class Model:
                 A list of strings that will stop the generation if encountered in the model's output.
             grammar (`str`, *optional*):
                 The grammar or formatting structure to use in the model's response.
-            max_tokens (`int`, *optional*):
-                The maximum count of tokens to generate.
 
         Returns:
             `str`: The text content of the model's response.
         """
@@ -244,7 +241,7 @@ class Model:
 class HfApiModel(Model):
     """A class to interact with Hugging Face's Inference API for language model interaction.
 
-    This engine allows you to communicate with Hugging Face's models using the Inference API. It can be used in both serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization.
+    This model allows you to communicate with Hugging Face's models using the Inference API. It can be used in serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization.
 
     Parameters:
         model_id (`str`, *optional*, defaults to `"Qwen/Qwen2.5-Coder-32B-Instruct"`):
@@ -265,9 +262,10 @@ class HfApiModel(Model):
     >>> engine = HfApiModel(
     ...     model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
     ...     token="your_hf_token_here",
+    ...     max_tokens=5000,
     ... )
     >>> messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}]
-    >>> response = engine(messages, stop_sequences=["END"], max_tokens=1500)
+    >>> response = engine(messages, stop_sequences=["END"])
     >>> print(response)
     "Quantum mechanics is the branch of physics that studies..."
     ```
@@ -279,6 +277,7 @@ class HfApiModel(Model):
         token: Optional[str] = None,
         timeout: Optional[int] = 120,
         temperature: float = 0.5,
+        **kwargs,
     ):
         super().__init__()
         self.model_id = model_id
@@ -286,13 +285,13 @@ class HfApiModel(Model):
             token = os.getenv("HF_TOKEN")
         self.client = InferenceClient(self.model_id, token=token, timeout=timeout)
         self.temperature = temperature
+        self.kwargs = kwargs
 
     def __call__(
         self,
         messages: List[Dict[str, str]],
         stop_sequences: Optional[List[str]] = None,
         grammar: Optional[str] = None,
-        max_tokens: int = 1500,
         tools_to_call_from: Optional[List[Tool]] = None,
     ) -> ChatMessage:
         """
@@ -308,16 +307,16 @@ class HfApiModel(Model):
                 tools=[get_json_schema(tool) for tool in tools_to_call_from],
                 tool_choice="auto",
                 stop=stop_sequences,
-                max_tokens=max_tokens,
                 temperature=self.temperature,
+                **self.kwargs,
             )
         else:
             response = self.client.chat.completions.create(
                 model=self.model_id,
                 messages=messages,
                 stop=stop_sequences,
-                max_tokens=max_tokens,
                 temperature=self.temperature,
+                **self.kwargs,
             )
         self.last_input_token_count = response.usage.prompt_tokens
         self.last_output_token_count = response.usage.completion_tokens
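
With `**kwargs` captured in `__init__` and spliced into both `chat.completions.create` calls, any option the InferenceClient accepts can now be set once per model instance rather than per call. A minimal sketch (`max_tokens` and `seed` values are illustrative):

```python
from smolagents import HfApiModel

# Each kwarg below is stored in self.kwargs and forwarded to
# client.chat.completions.create on every invocation.
model = HfApiModel(
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    token="your_hf_token_here",
    max_tokens=1500,  # replaces the removed per-call argument
    seed=42,          # illustrative: any chat-completion option works
)
response = model([{"role": "user", "content": "Hello!"}], stop_sequences=["END"])
print(response.content)
```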
@@ -325,16 +324,44 @@ class HfApiModel(Model):
 
 class TransformersModel(Model):
-    """This engine initializes a model and tokenizer from the given `model_id`.
+    """A class that runs a local Transformers model for language model interaction.
+
+    This model initializes a model and tokenizer from the given `model_id` and runs generation locally, supporting features like stop sequences and custom generation kwargs.
 
     Parameters:
         model_id (`str`, *optional*, defaults to `"HuggingFaceTB/SmolLM2-1.7B-Instruct"`):
             The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub.
-        device (`str`, optional, defaults to `"cuda"` if available, else `"cpu"`.):
-            The device to load the model on (`"cpu"` or `"cuda"`).
+        device_map (`str`, *optional*):
+            The device_map to initialize your model with.
+        torch_dtype (`str`, *optional*):
+            The torch_dtype to initialize your model with.
+        kwargs (dict, *optional*):
+            Any additional keyword arguments that you want to pass to `model.generate()`, for instance `max_new_tokens`.
+
+    Raises:
+        ValueError:
+            If the model name is not provided.
+
+    Example:
+    ```python
+    >>> engine = TransformersModel(
+    ...     model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
+    ...     device_map="cuda",
+    ...     max_new_tokens=5000,
+    ... )
+    >>> messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}]
+    >>> response = engine(messages, stop_sequences=["END"])
+    >>> print(response)
+    "Quantum mechanics is the branch of physics that studies..."
+    ```
     """
 
-    def __init__(self, model_id: Optional[str] = None, device: Optional[str] = None):
+    def __init__(
+        self,
+        model_id: Optional[str] = None,
+        device_map: Optional[str] = None,
+        torch_dtype: Optional[str] = None,
+        **kwargs,
+    ):
         super().__init__()
         if not is_torch_available():
             raise ImportError("Please install torch in order to use TransformersModel.")
@@ -347,14 +374,14 @@ class TransformersModel(Model):
                 f"`model_id` not provided, using this default tokenizer for token counts: '{model_id}'"
             )
         self.model_id = model_id
-        if device is None:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.device = device
-        logger.info(f"Using device: {self.device}")
+        self.kwargs = kwargs
+        if device_map is None:
+            device_map = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Using device: {device_map}")
         try:
             self.tokenizer = AutoTokenizer.from_pretrained(model_id)
             self.model = AutoModelForCausalLM.from_pretrained(
-                model_id, device_map=self.device
+                model_id, device_map=device_map, torch_dtype=torch_dtype
             )
         except Exception as e:
             logger.warning(
@@ -363,7 +390,7 @@ class TransformersModel(Model):
             self.model_id = default_model_id
             self.tokenizer = AutoTokenizer.from_pretrained(default_model_id)
             self.model = AutoModelForCausalLM.from_pretrained(
-                model_id, device_map=self.device
+                default_model_id, device_map=device_map, torch_dtype=torch_dtype
             )
     def make_stopping_criteria(self, stop_sequences: List[str]) -> StoppingCriteriaList:
@@ -397,7 +424,6 @@ class TransformersModel(Model):
         messages: List[Dict[str, str]],
         stop_sequences: Optional[List[str]] = None,
         grammar: Optional[str] = None,
-        max_tokens: int = 1500,
         tools_to_call_from: Optional[List[Tool]] = None,
     ) -> ChatMessage:
         messages = get_clean_message_list(
@@ -422,10 +448,10 @@ class TransformersModel(Model):
         out = self.model.generate(
             **prompt_tensor,
-            max_new_tokens=max_tokens,
             stopping_criteria=(
                 self.make_stopping_criteria(stop_sequences) if stop_sequences else None
             ),
+            **self.kwargs,
         )
         generated_tokens = out[0, count_prompt_tokens:]
         output = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
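
Since the hardcoded `max_new_tokens=max_tokens` is gone, output length for `TransformersModel` is now controlled entirely through the generate kwargs bound at construction. A minimal sketch, reusing the small model from the tests below:

```python
from smolagents import TransformersModel

# max_new_tokens and do_sample travel through **self.kwargs into
# self.model.generate(), alongside the stopping criteria built per call.
model = TransformersModel(
    model_id="HuggingFaceTB/SmolLM2-135M-Instruct",  # small model, as in the tests
    device_map="auto",
    max_new_tokens=64,
    do_sample=False,
)
print(model([{"role": "user", "content": "Hello!"}]).content)
```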
@@ -458,6 +484,19 @@ class TransformersModel(Model):
 
 class LiteLLMModel(Model):
+    """This model connects to [LiteLLM](https://www.litellm.ai/) as a gateway to hundreds of LLMs.
+
+    Parameters:
+        model_id (`str`):
+            The model identifier to use with LiteLLM (e.g. "anthropic/claude-3-5-sonnet-20240620").
+        api_base (`str`):
+            The base URL of the provider's OpenAI-compatible API server.
+        api_key (`str`):
+            The API key to use for authentication.
+        **kwargs:
+            Additional keyword arguments to pass to the underlying completion call.
+    """
+
     def __init__(
         self,
         model_id="anthropic/claude-3-5-sonnet-20240620",
@@ -482,7 +521,6 @@ class LiteLLMModel(Model):
         messages: List[Dict[str, str]],
         stop_sequences: Optional[List[str]] = None,
         grammar: Optional[str] = None,
-        max_tokens: int = 1500,
         tools_to_call_from: Optional[List[Tool]] = None,
     ) -> ChatMessage:
         messages = get_clean_message_list(
@@ -495,7 +533,6 @@ class LiteLLMModel(Model):
                 tools=[get_json_schema(tool) for tool in tools_to_call_from],
                 tool_choice="required",
                 stop=stop_sequences,
-                max_tokens=max_tokens,
                 api_base=self.api_base,
                 api_key=self.api_key,
                 **self.kwargs,
@@ -505,7 +542,6 @@ class LiteLLMModel(Model):
                 model=self.model_id,
                 messages=messages,
                 stop=stop_sequences,
-                max_tokens=max_tokens,
                 api_base=self.api_base,
                 api_key=self.api_key,
                 **self.kwargs,
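
The same pattern applies to `LiteLLMModel`: anything LiteLLM's completion call accepts can be bound at construction and is forwarded on every invocation. A minimal sketch (the environment variable is an illustrative assumption):

```python
import os
from smolagents import LiteLLMModel

# max_tokens is now just another entry in self.kwargs, unpacked into
# the completion call together with api_base and api_key.
model = LiteLLMModel(
    model_id="anthropic/claude-3-5-sonnet-20240620",
    api_key=os.environ["ANTHROPIC_API_KEY"],  # illustrative
    max_tokens=1024,
)
response = model([{"role": "user", "content": "Hello!"}])
print(response.content)
```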
@@ -516,7 +552,7 @@ class LiteLLMModel(Model):
 
 class OpenAIServerModel(Model):
-    """This engine connects to an OpenAI-compatible API server.
+    """This model connects to an OpenAI-compatible API server.
 
     Parameters:
         model_id (`str`):
@@ -525,8 +561,6 @@ class OpenAIServerModel(Model):
             The base URL of the OpenAI-compatible API server.
         api_key (`str`):
             The API key to use for authentication.
-        temperature (`float`, *optional*, defaults to 0.7):
-            Controls randomness in the model's responses. Values between 0 and 2.
         **kwargs:
             Additional keyword arguments to pass to the OpenAI API.
     """
@@ -536,7 +570,6 @@ class OpenAIServerModel(Model):
         model_id: str,
         api_base: str,
         api_key: str,
-        temperature: float = 0.7,
         **kwargs,
     ):
         super().__init__()
@@ -545,7 +578,6 @@ class OpenAIServerModel(Model):
             base_url=api_base,
             api_key=api_key,
         )
-        self.temperature = temperature
        self.kwargs = kwargs
 
     def __call__(
@@ -553,7 +585,6 @@ class OpenAIServerModel(Model):
         messages: List[Dict[str, str]],
         stop_sequences: Optional[List[str]] = None,
         grammar: Optional[str] = None,
-        max_tokens: int = 1500,
         tools_to_call_from: Optional[List[Tool]] = None,
     ) -> ChatMessage:
         messages = get_clean_message_list(
@@ -566,8 +597,6 @@ class OpenAIServerModel(Model):
                 tools=[get_json_schema(tool) for tool in tools_to_call_from],
                 tool_choice="auto",
                 stop=stop_sequences,
-                max_tokens=max_tokens,
-                temperature=self.temperature,
                 **self.kwargs,
             )
         else:
@@ -575,8 +604,6 @@ class OpenAIServerModel(Model):
                 model=self.model_id,
                 messages=messages,
                 stop=stop_sequences,
-                max_tokens=max_tokens,
-                temperature=self.temperature,
                 **self.kwargs,
             )
         self.last_input_token_count = response.usage.prompt_tokens
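
With the dedicated `temperature` parameter removed, sampling settings for `OpenAIServerModel` go through `**kwargs` like everything else. A minimal sketch (endpoint and key are illustrative placeholders):

```python
import os
from smolagents import OpenAIServerModel

# temperature and max_tokens are plain kwargs now: stored in self.kwargs
# and unpacked into client.chat.completions.create on each call.
model = OpenAIServerModel(
    model_id="gpt-3.5-turbo",
    api_base="https://api.openai.com/v1",  # illustrative endpoint
    api_key=os.environ["OPENAI_API_KEY"],  # illustrative
    temperature=0.7,
    max_tokens=1500,
)
response = model([{"role": "user", "content": "Hello!"}])
print(response.content)
```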

File 4 of 4 (model tests)

@@ -16,7 +16,7 @@ import unittest
 import json
 from typing import Optional
 
-from smolagents import models, tool, ChatMessage, HfApiModel
+from smolagents import models, tool, ChatMessage, HfApiModel, TransformersModel
 
 
 class ModelTests(unittest.TestCase):
@@ -46,6 +46,17 @@ class ModelTests(unittest.TestCase):
         assert data["content"] == "Hello!"
 
     def test_get_hfapi_message_no_tool(self):
-        model = HfApiModel()
+        model = HfApiModel(max_tokens=10)
         messages = [{"role": "user", "content": "Hello!"}]
         model(messages, stop_sequences=["great"])
+
+    def test_transformers_message_no_tool(self):
+        model = TransformersModel(
+            model_id="HuggingFaceTB/SmolLM2-135M-Instruct",
+            max_new_tokens=5,
+            device_map="auto",
+            do_sample=False,
+        )
+        messages = [{"role": "user", "content": "Hello!"}]
+        output = model(messages, stop_sequences=["great"]).content
+        assert output == "assistant\nHello"