29 changes: 29 additions & 0 deletions .github/workflows/ci.yml
@@ -28,3 +28,32 @@ jobs:
- name: Test imports
run: |
python -c "import comlrl; print('✓ comlrl imported successfully')"

unit-tests:
name: Unit Tests
runs-on: ubuntu-latest
env:
COMLRL_TEST_MODEL_NAME: Qwen/Qwen2.5-0.5B
COMLRL_TEST_MODEL_NAME_ALT: Qwen/Qwen3-0.6B
COMLRL_TEST_CRITIC_MODEL_NAME: Qwen/Qwen3-0.6B-Instruct
TOKENIZERS_PARALLELISM: "false"
steps:
- uses: actions/checkout@v4

- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: '3.10'
cache: 'pip'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --no-cache-dir "setuptools-scm>=8.0.0"
pip install --no-cache-dir -r requirements.txt
pip install --no-cache-dir pytest
pip install --no-cache-dir -e . --no-deps

- name: Run unit tests
run: |
pytest -q
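
The new job exports the test model names as environment variables instead of hard-coding them in the test suite. As a minimal sketch (not part of this PR; the file name and fallback defaults are assumptions), a test module could read them like this:

```python
# tests/test_smoke.py -- illustrative sketch, not part of this PR
import os

from transformers import AutoTokenizer

# The CI job exports these variables; the fallbacks mirror its defaults for local runs.
MODEL_NAME = os.environ.get("COMLRL_TEST_MODEL_NAME", "Qwen/Qwen2.5-0.5B")
CRITIC_MODEL_NAME = os.environ.get("COMLRL_TEST_CRITIC_MODEL_NAME", "Qwen/Qwen3-0.6B-Instruct")


def test_tokenizer_roundtrip():
    # Smoke test: the configured model's tokenizer loads and produces token ids.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    assert tokenizer("hello world")["input_ids"]
```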
3 changes: 2 additions & 1 deletion README.md
@@ -11,6 +11,7 @@
[![PyPI downloads](https://img.shields.io/pypi/dm/comlrl.svg?logo=data:image/svg+xml;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjQgMjQiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjYjliMWIxIj48ZyBpZD0iU1ZHUmVwb19iZ0NhcnJpZXIiIHN0cm9rZS13aWR0aD0iMCI+PC9nPjxnIGlkPSJTVkdSZXBvX3RyYWNlckNhcnJpZXIiIHN0cm9rZS1saW5lY2FwPSJyb3VuZCIgc3Ryb2tlLWxpbmVqb2luPSJyb3VuZCI+PC9nPjxnIGlkPSJTVkdSZXBvX2ljb25DYXJyaWVyIj4gPHBhdGggZD0iTTIyIDIwLjgyMDFDMTUuNDI2IDIyLjM5MiA4LjU3NCAyMi4zOTIgMiAyMC44MjAxIiBzdHJva2U9IiNiM2IyYjIiIHN0cm9rZS13aWR0aD0iMS41IiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiPjwvcGF0aD4gPHBhdGggZD0iTTExLjk0OTIgMlYxNiIgc3Ryb2tlPSIjYjNiMmIyIiBzdHJva2Utd2lkdGg9IjEuNSIgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIiBzdHJva2UtbGluZWpvaW49InJvdW5kIj48L3BhdGg+IDxwYXRoIGQ9Ik0xNi44OTk2IDExLjhMMTMuMzc5NiAxNS40MDk5QzEzLjIwMTEgMTUuNTk3OCAxMi45ODYzIDE1Ljc0NzYgMTIuNzQ4MiAxNS44NDk5QzEyLjUxMDEgMTUuOTUyMSAxMi4yNTM4IDE2LjAwNDYgMTEuOTk0NiAxNi4wMDQ2QzExLjczNTUgMTYuMDA0NiAxMS40NzkxIDE1Ljk1MjEgMTEuMjQxIDE1Ljg0OTlDMTEuMDAyOSAxNS43NDc2IDEwLjc4ODEgMTUuNTk3OCAxMC42MDk2IDE1LjQwOTlMNy4wOTk2MSAxMS44IiBzdHJva2U9IiNiM2IyYjIiIHN0cm9rZS13aWR0aD0iMS41IiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiPjwvcGF0aD4gPC9nPjwvc3ZnPg==)](https://pypi.org/project/comlrl/)

[![CI](https://github.com/OpenMLRL/CoMLRL/actions/workflows/ci.yml/badge.svg)](https://github.com/OpenMLRL/CoMLRL/actions/workflows/ci.yml)
[![Tests](https://github.com/OpenMLRL/CoMLRL/actions/workflows/ci.yml/badge.svg?label=tests)](https://github.com/OpenMLRL/CoMLRL/actions/workflows/ci.yml)
[![pre-commit.ci](https://github.com/OpenMLRL/CoMLRL/actions/workflows/pre-commit.yml/badge.svg)](https://github.com/OpenMLRL/CoMLRL/actions/workflows/pre-commit.yml)
[![Docs Build](https://github.com/OpenMLRL/CoMLRL/actions/workflows/docs-build.yml/badge.svg)](https://github.com/OpenMLRL/CoMLRL/actions/workflows/docs-build.yml)
[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?logo=black&logoColor=lightgrey)](https://github.com/psf/black)
@@ -122,7 +123,7 @@ Please cite the following papers if you find this library useful in your research

```bibtex
@inproceedings{liu2025llmcollabmarl,
title = {{LLM} Collaboration With Multi-Agent Reinforcement Learning},
title = {LLM Collaboration With Multi-Agent Reinforcement Learning},
author = {Liu, Shuo and Liang, Zeyu and Lyu, Xueguang and Amato, Christopher},
booktitle = {Proceedings of the 40th Annual AAAI Conference on Artificial Intelligence},
year = {2026}
11 changes: 10 additions & 1 deletion comlrl/trainers/actor_critic/ac_base.py
@@ -39,7 +39,16 @@ def _resolve_model_sources(
expected_label: Optional[str] = None,
) -> Tuple[List[Any], Optional[str]]:
if model is not None and models is not None:
raise ValueError(f"Cannot provide both model and {kind}.")
is_name_list = (
isinstance(models, Sequence)
and not isinstance(models, (str, bytes))
and all(isinstance(src, str) for src in models)
)
if not is_name_list or len(models) != expected_count:
label = expected_label or f"num_agents ({expected_count})"
raise ValueError(
f"Cannot provide both model and {kind} unless {kind} is a list of {label} model names."
)
if model is None and models is None:
raise ValueError(f"Either model or {kind} must be provided.")
if expected_count < 1:
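
The reworked check in `_resolve_model_sources` relaxes the old rule: `model` and a `models`-style argument may now be combined, but only when the latter is a list of exactly `expected_count` model-name strings. A standalone sketch of just that rule (the helper name and call shape below are assumptions for illustration):

```python
from collections.abc import Sequence
from typing import Any, Optional


def check_sources(model: Any, models: Optional[Sequence[Any]], expected_count: int, kind: str) -> None:
    # Only the "both provided" case is restricted; either argument alone is handled elsewhere.
    if model is None or models is None:
        return
    is_name_list = (
        isinstance(models, Sequence)
        and not isinstance(models, (str, bytes))
        and all(isinstance(src, str) for src in models)
    )
    if not is_name_list or len(models) != expected_count:
        raise ValueError(
            f"Cannot provide both model and {kind} unless {kind} is a list of {expected_count} model names."
        )


# Allowed: a list of exactly expected_count names.
check_sources("Qwen/Qwen2.5-0.5B", ["Qwen/Qwen2.5-0.5B", "Qwen/Qwen3-0.6B"], expected_count=2, kind="agents")
# Still rejected: loaded model objects (or a wrong count) alongside `model`.
# check_sources("Qwen/Qwen2.5-0.5B", [some_loaded_model], expected_count=2, kind="agents")
```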
6 changes: 0 additions & 6 deletions comlrl/trainers/actor_critic/iac.py
@@ -136,8 +136,6 @@ def __init__(
raise ValueError("A callable reward_func must be provided.")
if model is None and agents is None:
raise ValueError("Either model or agents must be provided.")
if model is not None and agents is not None:
raise ValueError("Cannot provide both model and agents parameters.")

self.args = args if args is not None else IACConfig()
if not self.args.use_separate_critic and critics is not None:
@@ -153,9 +151,6 @@
self.critic_type = (self.args.critic_type or "v").lower()

self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if not torch.cuda.is_available():
# CPU fallback is allowed for experimentation but will be slow.
print("Warning: CUDA not available. Training will run on CPU.")

self.agent_models: List[CausalLMWithValueHead] = []
self.critic_models: List[Optional[CausalLMWithValueHead]] = []
@@ -191,7 +186,6 @@ def __init__(
models=agents,
expected_count=self.args.num_agents,
)

for actor_source in actor_sources:
agent_model = self._load_agent_model(actor_source)
agent_model.to(self.device)
4 changes: 0 additions & 4 deletions comlrl/trainers/actor_critic/maac.py
@@ -112,8 +112,6 @@ def __init__(
raise ValueError("A callable reward_func must be provided.")
if model is None and agents is None:
raise ValueError("Either model or agents must be provided.")
if model is not None and agents is not None:
raise ValueError("Cannot provide both model and agents parameters.")
self.args = args if args is not None else MAACConfig()
self.reward_func = reward_func
self.reward_processor = reward_processor or (lambda x: x)
@@ -123,8 +121,6 @@
self.model_config = model_config or {}

self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if not torch.cuda.is_available():
print("Warning: CUDA not available. Training will run on CPU.")

if agents is not None and tokenizer is None:
raise ValueError("Tokenizer must be provided when agents are passed.")
83 changes: 51 additions & 32 deletions comlrl/trainers/reinforce/magrpo.py
@@ -3,7 +3,7 @@
import random
from dataclasses import dataclass
import itertools
from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Type
from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Type, Sequence

import numpy as np
import torch
@@ -134,16 +134,21 @@ def __init__(
eval_aggregator: Optional[Callable] = None,
args: Optional[MAGRPOConfig] = None,
):
# Check for GPU availability
if not torch.cuda.is_available():
raise RuntimeError(
"GPU not found. MAGRPOTrainer requires GPU for training."
)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if model is None and agents is None:
raise ValueError("Either model or agents must be provided")
agents_is_name_list = (
agents is not None
and isinstance(agents, Sequence)
and not isinstance(agents, (str, bytes))
and all(isinstance(src, str) for src in agents)
)
if model is not None and agents is not None:
raise ValueError("Cannot provide both model and agents parameters")
if not agents_is_name_list or len(agents) != num_agents:
raise ValueError(
"Cannot provide both model and agents unless agents is a list of num_agents model names."
)

self.args = args if args is not None else self.default_config_cls()
self.env_step = 0
@@ -155,37 +160,51 @@ def __init__(
self._setup_formatters(formatters, num_agents)
self._setup_reward_function(reward_func, reward_processor)

self.model_config = model_config if model_config else {}
model_kwargs = {}
torch_dtype = None
if isinstance(self.model_config, dict):
torch_dtype = self.model_config.get("torch_dtype") or self.model_config.get(
"dtype"
)
if torch_dtype is not None:
model_kwargs["torch_dtype"] = torch_dtype

if agents is not None:
self.agents = agents
self.num_agents = len(agents)
if (
hasattr(agents[0], "base_model")
and hasattr(agents[0].base_model, "config")
and hasattr(agents[0].base_model.config, "model_type")
):
self.model_name = agents[0].base_model.config.model_type
elif hasattr(agents[0], "config") and hasattr(
agents[0].config, "_name_or_path"
):
self.model_name = agents[0].config._name_or_path
else:
self.model_name = agents[0].__class__.__name__
if agents_is_name_list:
from transformers import AutoModelForCausalLM, AutoTokenizer

self.model_config = model_config if model_config else {}
self.agents = [
AutoModelForCausalLM.from_pretrained(name, **model_kwargs)
for name in agents
]
self.num_agents = len(agents)
self.model_name = agents[0]
if tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(agents[0])
special_tokens = self.model_config.get("special_tokens", {})
if special_tokens:
self.tokenizer.add_special_tokens(special_tokens)
else:
self.agents = agents
self.num_agents = len(agents)
if (
hasattr(agents[0], "base_model")
and hasattr(agents[0].base_model, "config")
and hasattr(agents[0].base_model.config, "model_type")
):
self.model_name = agents[0].base_model.config.model_type
elif hasattr(agents[0], "config") and hasattr(
agents[0].config, "_name_or_path"
):
self.model_name = agents[0].config._name_or_path
else:
self.model_name = agents[0].__class__.__name__
else:
self.model_config = model_config if model_config else {}
self.num_agents = num_agents
if isinstance(model, str):
from transformers import AutoModelForCausalLM, AutoTokenizer

model_kwargs = {}
torch_dtype = None
if isinstance(self.model_config, dict):
torch_dtype = self.model_config.get(
"torch_dtype"
) or self.model_config.get("dtype")
if torch_dtype is not None:
model_kwargs["torch_dtype"] = torch_dtype
self.agents = [
AutoModelForCausalLM.from_pretrained(model, **model_kwargs)
for _ in range(num_agents)
@@ -704,7 +723,7 @@ def train(self, **kwargs):
if self.wandb_config is not None and not self.wandb_initialized:
self._init_wandb()

device = torch.device("cuda")
device = self.device
for agent in self.agents:
agent.to(device)
agent.train()
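
With this change, `agents` may be a list of Hugging Face checkpoint names and the trainer loads the models (and, if no tokenizer is passed, the tokenizer) itself. A hedged usage sketch based only on the constructor parameters visible in this diff; the import path mirrors the file location, and the reward-function signature, dtype handling, and training arguments are simplified assumptions:

```python
import torch

# Import path assumed from the module location shown in this diff.
from comlrl.trainers.reinforce.magrpo import MAGRPOTrainer


def toy_reward(completions, **kwargs):
    # Placeholder reward: prefer shorter joint outputs (signature is an assumption).
    return [-float(len(c)) for c in completions]


trainer = MAGRPOTrainer(
    agents=["Qwen/Qwen2.5-0.5B", "Qwen/Qwen3-0.6B"],  # one checkpoint name per agent
    num_agents=2,                                      # must match len(agents)
    reward_func=toy_reward,
    model_config={"torch_dtype": torch.bfloat16},      # picked up as torch_dtype for from_pretrained
)
# trainer.train(...)  # dataset and logging arguments are omitted in this sketch
```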
3 changes: 1 addition & 2 deletions docs/content/docs/user-guide/ac-finetuning.md
@@ -16,7 +16,6 @@ J(\theta_i) = \mathbb{E}_{o_{i,0} \sim \mathcal{D}, h_i \sim \pi_{\theta_i}}\lef

where {{< katex inline=true >}}\delta_{i,t} = r_{i,t} + \gamma V_{\phi_i}(h_{i,t+1}) - V_{\phi_i}(h_{i,t}){{< /katex >}} is the (single-step) temporal difference error and {{< katex inline=true >}}\gamma{{< /katex >}} is the discount factor. Use `critic_type='q'` to switch to a Q-value critic {{< katex inline=true >}}Q(h_t, a_t){{< /katex >}}; the default is `critic_type='v'`.

where {{< katex inline=true >}}\hat{V}_t{{< /katex >}} is the value target and {{< katex inline=true >}}\epsilon_v{{< /katex >}} corresponds to `value_clip_range`.

CoMLRL supports two IAC architectures for critic implementation:

@@ -81,7 +80,7 @@ L(\phi_i) = \max\Big( (V_{\phi_i}(h_t) - \hat{V}_t)^2,\ (V_{\phi_i}^{\text{clip}
{{% /hint %}}

{{% hint warning %}}
For simplicity, IAC computes the policy gradient using the current policy's samples without importance sampling or ratio clipping. Shared-critic mode (`use_separate_critic=false`) can be less stable; `value_clip_range` only applies in that mode.
For simplicity, IAC computes the policy gradient using the current policy's samples without importance sampling or ratio clipping. In shared-critic mode (`use_separate_critic=false`), value heads are attached to the actor models (do not pass `critics`; passing them raises an error), and agents may be homogeneous or heterogeneous; this mode can be less stable, and `value_clip_range` only applies there. In separate-critic mode (`use_separate_critic=true`), pass a `critics` list with length equal to `num_agents`; critic models may differ from actor models.
{{% /hint %}}

{{% hint warning %}}
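
The revised hint distinguishes shared-critic from separate-critic IAC. A configuration sketch of the two modes; `IACTrainer` and its exact constructor arguments are assumptions, while the `IACConfig` fields and the `critics` rule follow the changed files and docs above:

```python
from comlrl.trainers.actor_critic.iac import IACConfig, IACTrainer  # trainer class name assumed


def toy_reward(completions, **kwargs):
    return [0.0 for _ in completions]  # placeholder reward


# Shared-critic mode: value heads sit on the actor models; do NOT pass `critics`.
shared_args = IACConfig(num_agents=2, use_separate_critic=False, value_clip_range=0.2, critic_type="v")
shared_trainer = IACTrainer(
    agents=["Qwen/Qwen2.5-0.5B", "Qwen/Qwen3-0.6B"],  # agents may be heterogeneous here
    reward_func=toy_reward,
    args=shared_args,
)

# Separate-critic mode: one critic per agent; critic checkpoints may differ from the actors.
separate_args = IACConfig(num_agents=2, use_separate_critic=True, critic_type="q")
separate_trainer = IACTrainer(
    agents=["Qwen/Qwen2.5-0.5B", "Qwen/Qwen2.5-0.5B"],
    critics=["Qwen/Qwen3-0.6B-Instruct", "Qwen/Qwen3-0.6B-Instruct"],  # length must equal num_agents
    reward_func=toy_reward,
    args=separate_args,
)
```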