diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c9844cf..af9b6e2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,3 +28,32 @@ jobs:
       - name: Test imports
         run: |
           python -c "import comlrl; print('✓ comlrl imported successfully')"
+
+  unit-tests:
+    name: Unit Tests
+    runs-on: ubuntu-latest
+    env:
+      COMLRL_TEST_MODEL_NAME: Qwen/Qwen2.5-0.5B
+      COMLRL_TEST_MODEL_NAME_ALT: Qwen/Qwen3-0.6B
+      COMLRL_TEST_CRITIC_MODEL_NAME: Qwen/Qwen3-0.6B-Instruct
+      TOKENIZERS_PARALLELISM: "false"
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install --no-cache-dir "setuptools-scm>=8.0.0"
+          pip install --no-cache-dir -r requirements.txt
+          pip install --no-cache-dir pytest
+          pip install --no-cache-dir -e . --no-deps
+
+      - name: Run unit tests
+        run: |
+          pytest -q
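The job above can also be reproduced locally. A minimal sketch (not part of the patch): it assumes the suite lives under `tests/` and mirrors the workflow's env defaults, which you can override to swap models.

```python
# Local equivalent of the CI "Run unit tests" step (a sketch, not a fixed API).
import os
import sys

import pytest

# Same defaults the workflow exports; existing env vars take precedence.
os.environ.setdefault("COMLRL_TEST_MODEL_NAME", "Qwen/Qwen2.5-0.5B")
os.environ.setdefault("COMLRL_TEST_MODEL_NAME_ALT", "Qwen/Qwen3-0.6B")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Run the suite quietly and propagate pytest's exit code.
sys.exit(pytest.main(["-q", "tests"]))
```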
diff --git a/README.md b/README.md
index 2925b77..2f89520 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@
 [![PyPI downloads](https://img.shields.io/pypi/dm/comlrl.svg?logo=data:image/svg+xml;base64,PHN2ZyB2aWV3Qm94PSIwIDAgMjQgMjQiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjYjliMWIxIj48ZyBpZD0iU1ZHUmVwb19iZ0NhcnJpZXIiIHN0cm9rZS13aWR0aD0iMCI+PC9nPjxnIGlkPSJTVkdSZXBvX3RyYWNlckNhcnJpZXIiIHN0cm9rZS1saW5lY2FwPSJyb3VuZCIgc3Ryb2tlLWxpbmVqb2luPSJyb3VuZCI+PC9nPjxnIGlkPSJTVkdSZXBvX2ljb25DYXJyaWVyIj4gPHBhdGggZD0iTTIyIDIwLjgyMDFDMTUuNDI2IDIyLjM5MiA4LjU3NCAyMi4zOTIgMiAyMC44MjAxIiBzdHJva2U9IiNiM2IyYjIiIHN0cm9rZS13aWR0aD0iMS41IiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiPjwvcGF0aD4gPHBhdGggZD0iTTExLjk0OTIgMlYxNiIgc3Ryb2tlPSIjYjNiMmIyIiBzdHJva2Utd2lkdGg9IjEuNSIgc3Ryb2tlLWxpbmVjYXA9InJvdW5kIiBzdHJva2UtbGluZWpvaW49InJvdW5kIj48L3BhdGg+IDxwYXRoIGQ9Ik0xNi44OTk2IDExLjhMMTMuMzc5NiAxNS40MDk5QzEzLjIwMTEgMTUuNTk3OCAxMi45ODYzIDE1Ljc0NzYgMTIuNzQ4MiAxNS44NDk5QzEyLjUxMDEgMTUuOTUyMSAxMi4yNTM4IDE2LjAwNDYgMTEuOTk0NiAxNi4wMDQ2QzExLjczNTUgMTYuMDA0NiAxMS40NzkxIDE1Ljk1MjEgMTEuMjQxIDE1Ljg0OTlDMTEuMDAyOSAxNS43NDc2IDEwLjc4ODEgMTUuNTk3OCAxMC42MDk2IDE1LjQwOTlMNy4wOTk2MSAxMS44IiBzdHJva2U9IiNiM2IyYjIiIHN0cm9rZS13aWR0aD0iMS41IiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiPjwvcGF0aD4gPC9nPjwvc3ZnPg==)](https://pypi.org/project/comlrl/)
 [![CI](https://github.com/OpenMLRL/CoMLRL/actions/workflows/ci.yml/badge.svg)](https://github.com/OpenMLRL/CoMLRL/actions/workflows/ci.yml)
+[![Tests](https://github.com/OpenMLRL/CoMLRL/actions/workflows/ci.yml/badge.svg?label=tests)](https://github.com/OpenMLRL/CoMLRL/actions/workflows/ci.yml)
 [![pre-commit.ci](https://github.com/OpenMLRL/CoMLRL/actions/workflows/pre-commit.yml/badge.svg)](https://github.com/OpenMLRL/CoMLRL/actions/workflows/pre-commit.yml)
 [![Docs Build](https://github.com/OpenMLRL/CoMLRL/actions/workflows/docs-build.yml/badge.svg)](https://github.com/OpenMLRL/CoMLRL/actions/workflows/docs-build.yml)
 [![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?logo=black&logoColor=lightgrey)](https://github.com/psf/black)
@@ -122,7 +123,7 @@ Please cite the following papers if you find this library useful in your researc
 ```bibtex
 @inproceedings{liu2025llmcollabmarl,
-  title = {{LLM} Collaboration With Multi-Agent Reinforcement Learning},
+  title = {LLM Collaboration With Multi-Agent Reinforcement Learning},
   author = {Liu, Shuo and Liang, Zeyu and Lyu, Xueguang and Amato, Christopher},
   booktitle = {Proceedings of the 40th Annual AAAI Conference on Artificial Intelligence},
   year = {2026}
diff --git a/comlrl/trainers/actor_critic/ac_base.py b/comlrl/trainers/actor_critic/ac_base.py
index cce94b1..48e3cb3 100644
--- a/comlrl/trainers/actor_critic/ac_base.py
+++ b/comlrl/trainers/actor_critic/ac_base.py
@@ -39,7 +39,16 @@ def _resolve_model_sources(
     expected_label: Optional[str] = None,
 ) -> Tuple[List[Any], Optional[str]]:
     if model is not None and models is not None:
-        raise ValueError(f"Cannot provide both model and {kind}.")
+        is_name_list = (
+            isinstance(models, Sequence)
+            and not isinstance(models, (str, bytes))
+            and all(isinstance(src, str) for src in models)
+        )
+        if not is_name_list or len(models) != expected_count:
+            label = expected_label or f"num_agents ({expected_count})"
+            raise ValueError(
+                f"Cannot provide both model and {kind} unless {kind} is a list of {label} model names."
+            )
     if model is None and models is None:
         raise ValueError(f"Either model or {kind} must be provided.")
     if expected_count < 1:
diff --git a/comlrl/trainers/actor_critic/iac.py b/comlrl/trainers/actor_critic/iac.py
index d80b708..731fb4e 100644
--- a/comlrl/trainers/actor_critic/iac.py
+++ b/comlrl/trainers/actor_critic/iac.py
@@ -136,8 +136,6 @@ def __init__(
             raise ValueError("A callable reward_func must be provided.")
         if model is None and agents is None:
             raise ValueError("Either model or agents must be provided.")
-        if model is not None and agents is not None:
-            raise ValueError("Cannot provide both model and agents parameters.")
 
         self.args = args if args is not None else IACConfig()
         if not self.args.use_separate_critic and critics is not None:
@@ -153,9 +151,6 @@
         self.critic_type = (self.args.critic_type or "v").lower()
 
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        if not torch.cuda.is_available():
-            # CPU fallback is allowed for experimentation but will be slow.
-            print("Warning: CUDA not available. Training will run on CPU.")
 
         self.agent_models: List[CausalLMWithValueHead] = []
         self.critic_models: List[Optional[CausalLMWithValueHead]] = []
@@ -191,7 +186,6 @@
             models=agents,
             expected_count=self.args.num_agents,
         )
-
        for actor_source in actor_sources:
             agent_model = self._load_agent_model(actor_source)
             agent_model.to(self.device)
diff --git a/comlrl/trainers/actor_critic/maac.py b/comlrl/trainers/actor_critic/maac.py
index a8ce9e1..a9d027e 100644
--- a/comlrl/trainers/actor_critic/maac.py
+++ b/comlrl/trainers/actor_critic/maac.py
@@ -112,8 +112,6 @@ def __init__(
             raise ValueError("A callable reward_func must be provided.")
         if model is None and agents is None:
             raise ValueError("Either model or agents must be provided.")
-        if model is not None and agents is not None:
-            raise ValueError("Cannot provide both model and agents parameters.")
         self.args = args if args is not None else MAACConfig()
         self.reward_func = reward_func
         self.reward_processor = reward_processor or (lambda x: x)
@@ -123,8 +121,6 @@
         self.model_config = model_config or {}
 
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        if not torch.cuda.is_available():
-            print("Warning: CUDA not available. Training will run on CPU.")
 
         if agents is not None and tokenizer is None:
             raise ValueError("Tokenizer must be provided when agents are passed.")
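Taken together, these changes relax the old "model XOR agents" rule: both may now be passed when `agents` is a list of exactly `num_agents` model-name strings. A minimal sketch of the resulting call, mirroring the tests added below (model names are only examples):

```python
from transformers import AutoTokenizer

from comlrl.trainers.actor_critic import IACTrainer
from comlrl.trainers.actor_critic.iac import IACConfig

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")

# Now accepted: `model` plus `agents`, where `agents` names exactly
# num_agents checkpoints to load per agent.
trainer = IACTrainer(
    model="Qwen/Qwen2.5-0.5B",
    agents=["Qwen/Qwen2.5-0.5B", "Qwen/Qwen3-0.6B"],
    tokenizer=tokenizer,
    reward_func=lambda *a, **k: [0.0],  # stub reward for illustration
    args=IACConfig(num_agents=2, num_turns=1, use_separate_critic=False),
)

# Still rejected: any length mismatch (or non-string entries) raises the
# "Cannot provide both model and agents ..." ValueError, as tested below.
```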
Training will run on CPU.") if agents is not None and tokenizer is None: raise ValueError("Tokenizer must be provided when agents are passed.") diff --git a/comlrl/trainers/reinforce/magrpo.py b/comlrl/trainers/reinforce/magrpo.py index 4ee8402..96a433a 100644 --- a/comlrl/trainers/reinforce/magrpo.py +++ b/comlrl/trainers/reinforce/magrpo.py @@ -3,7 +3,7 @@ import random from dataclasses import dataclass import itertools -from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Type +from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Type, Sequence import numpy as np import torch @@ -134,16 +134,21 @@ def __init__( eval_aggregator: Optional[Callable] = None, args: Optional[MAGRPOConfig] = None, ): - # Check for GPU availability - if not torch.cuda.is_available(): - raise RuntimeError( - "GPU not found. MAGRPOTrainer requires GPU for training." - ) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if model is None and agents is None: raise ValueError("Either model or agents must be provided") + agents_is_name_list = ( + agents is not None + and isinstance(agents, Sequence) + and not isinstance(agents, (str, bytes)) + and all(isinstance(src, str) for src in agents) + ) if model is not None and agents is not None: - raise ValueError("Cannot provide both model and agents parameters") + if not agents_is_name_list or len(agents) != num_agents: + raise ValueError( + "Cannot provide both model and agents unless agents is a list of num_agents model names." + ) self.args = args if args is not None else self.default_config_cls() self.env_step = 0 @@ -155,37 +160,51 @@ def __init__( self._setup_formatters(formatters, num_agents) self._setup_reward_function(reward_func, reward_processor) + self.model_config = model_config if model_config else {} + model_kwargs = {} + torch_dtype = None + if isinstance(self.model_config, dict): + torch_dtype = self.model_config.get("torch_dtype") or self.model_config.get( + "dtype" + ) + if torch_dtype is not None: + model_kwargs["torch_dtype"] = torch_dtype + if agents is not None: - self.agents = agents - self.num_agents = len(agents) - if ( - hasattr(agents[0], "base_model") - and hasattr(agents[0].base_model, "config") - and hasattr(agents[0].base_model.config, "model_type") - ): - self.model_name = agents[0].base_model.config.model_type - elif hasattr(agents[0], "config") and hasattr( - agents[0].config, "_name_or_path" - ): - self.model_name = agents[0].config._name_or_path - else: - self.model_name = agents[0].__class__.__name__ + if agents_is_name_list: + from transformers import AutoModelForCausalLM, AutoTokenizer - self.model_config = model_config if model_config else {} + self.agents = [ + AutoModelForCausalLM.from_pretrained(name, **model_kwargs) + for name in agents + ] + self.num_agents = len(agents) + self.model_name = agents[0] + if tokenizer is None: + self.tokenizer = AutoTokenizer.from_pretrained(agents[0]) + special_tokens = self.model_config.get("special_tokens", {}) + if special_tokens: + self.tokenizer.add_special_tokens(special_tokens) + else: + self.agents = agents + self.num_agents = len(agents) + if ( + hasattr(agents[0], "base_model") + and hasattr(agents[0].base_model, "config") + and hasattr(agents[0].base_model.config, "model_type") + ): + self.model_name = agents[0].base_model.config.model_type + elif hasattr(agents[0], "config") and hasattr( + agents[0].config, "_name_or_path" + ): + self.model_name = agents[0].config._name_or_path + else: + self.model_name = 
diff --git a/docs/content/docs/user-guide/ac-finetuning.md b/docs/content/docs/user-guide/ac-finetuning.md
index a171f4a..39cc9c7 100644
--- a/docs/content/docs/user-guide/ac-finetuning.md
+++ b/docs/content/docs/user-guide/ac-finetuning.md
@@ -16,7 +16,6 @@ J(\theta_i) = \mathbb{E}_{o_{i,0} \sim \mathcal{D}, h_i \sim \pi_{\theta_i}}\lef
 
 where {{< katex inline=true >}}\delta_{i,t} = r_{i,t} + \gamma V_{\phi_i}(h_{i,t+1}) - V_{\phi_i}(h_{i,t}){{< /katex >}} is the (single-step) temporal difference error and {{< katex inline=true >}}\gamma{{< /katex >}} is the discount factor. Use `critic_type='q'` to switch to a Q-value critic {{< katex inline=true >}}Q(h_t, a_t){{< /katex >}}; the default is `critic_type='v'`.
 
-where {{< katex inline=true >}}\hat{V}_t{{< /katex >}} is the value target and {{< katex inline=true >}}\epsilon_v{{< /katex >}} corresponds to `value_clip_range`.
 
 CoMLRL supports two IAC architectures for critic implementation:
 
@@ -81,7 +80,7 @@ L(\phi_i) = \max\Big( (V_{\phi_i}(h_t) - \hat{V}_t)^2,\ (V_{\phi_i}^{\text{clip}
 {{% /hint %}}
 
 {{% hint warning %}}
-For simplicity, IAC computes the policy gradient using the current policy's samples without importance sampling or ratio clipping. Shared-critic mode (`use_separate_critic=false`) can be less stable; `value_clip_range` only applies in that mode.
+For simplicity, IAC computes the policy gradient using the current policy's samples without importance sampling or ratio clipping. In shared-critic mode (`use_separate_critic=false`), value heads are attached to the actor models (do not pass `critics`; passing them raises an error), and agents may be homogeneous or heterogeneous; this mode can be less stable, and `value_clip_range` only applies there. In separate-critic mode (`use_separate_critic=true`), pass a `critics` list of length `num_agents`; critic models may differ from the actor models.
 {{% /hint %}}
 
 {{% hint warning %}}
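As a usage sketch of the separate-critic mode described in the updated hint (mirroring the new `test_iac_separate_critics` below; model names are placeholders):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from comlrl.trainers.actor_critic import IACTrainer
from comlrl.trainers.actor_critic.iac import IACConfig

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
actors = [
    AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B"),
    AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B"),
]
# Critic models may differ from the actors; here they are deliberately swapped.
critics = [
    AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B"),
    AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B"),
]

# Separate-critic mode: `critics` must have length num_agents.
trainer = IACTrainer(
    agents=actors,
    critics=critics,
    tokenizer=tokenizer,
    reward_func=lambda *a, **k: [0.0],  # stub reward for illustration
    args=IACConfig(num_agents=2, num_turns=1, use_separate_critic=True),
)
```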
diff --git a/tests/test_model_loading.py b/tests/test_model_loading.py
new file mode 100644
index 0000000..3e37d80
--- /dev/null
+++ b/tests/test_model_loading.py
@@ -0,0 +1,207 @@
+import gc
+import os
+
+import pytest
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from comlrl.trainers.actor_critic import IACTrainer, MAACTrainer
+from comlrl.trainers.actor_critic.iac import IACConfig
+from comlrl.trainers.actor_critic.maac import MAACConfig
+from comlrl.trainers.reinforce import MAGRPOTrainer
+from comlrl.trainers.reinforce.magrpo import MAGRPOConfig
+
+MODEL_NAME_05 = os.getenv("COMLRL_TEST_MODEL_NAME", "Qwen/Qwen2.5-0.5B")
+MODEL_NAME_06 = os.getenv("COMLRL_TEST_MODEL_NAME_ALT", "Qwen/Qwen3-0.6B")
+
+
+def _reward_func(*_args, **_kwargs):
+    return [0.0]
+
+
+@pytest.fixture(scope="session")
+def tokenizer_05():
+    return AutoTokenizer.from_pretrained(MODEL_NAME_05)
+
+
+@pytest.fixture(scope="session")
+def model_05():
+    return AutoModelForCausalLM.from_pretrained(MODEL_NAME_05)
+
+
+@pytest.fixture(scope="session")
+def model_06():
+    return AutoModelForCausalLM.from_pretrained(MODEL_NAME_06)
+
+
+def _cleanup(*objs):
+    for obj in objs:
+        del obj
+    gc.collect()
+
+
+def test_magrpo_model_name():
+    args = MAGRPOConfig(num_agents=2, num_turns=1, num_generations=2)
+    trainer = MAGRPOTrainer(
+        model=MODEL_NAME_05,
+        num_agents=2,
+        reward_func=_reward_func,
+        args=args,
+    )
+    assert trainer.num_agents == 2
+    assert trainer.model_name == MODEL_NAME_05
+    _cleanup(trainer)
+
+
+def test_magrpo_pretrained(tokenizer_05, model_05, model_06):
+    args = MAGRPOConfig(num_agents=2, num_turns=1, num_generations=2)
+    trainer = MAGRPOTrainer(
+        agents=[model_05, model_06],
+        tokenizer=tokenizer_05,
+        reward_func=_reward_func,
+        args=args,
+    )
+    assert trainer.num_agents == 2
+    assert trainer.model_name is not None
+    _cleanup(trainer)
+
+
+def test_maac_model_name():
+    args = MAACConfig(num_agents=2, num_turns=1)
+    trainer = MAACTrainer(
+        model=MODEL_NAME_05,
+        critics=[MODEL_NAME_06],
+        reward_func=_reward_func,
+        args=args,
+    )
+    assert len(trainer.agent_models) == 2
+    assert trainer.agent_model_name == MODEL_NAME_05
+    assert trainer.critic_model_name == MODEL_NAME_06
+    _cleanup(trainer)
+
+
+def test_maac_pretrained(tokenizer_05, model_05, model_06):
+    args = MAACConfig(num_agents=2, num_turns=1)
+    trainer = MAACTrainer(
+        agents=[model_05, model_06],
+        critics=[model_06],
+        tokenizer=tokenizer_05,
+        reward_func=_reward_func,
+        args=args,
+    )
+    assert len(trainer.agent_models) == 2
+    assert trainer.critic_model is not None
+    _cleanup(trainer)
+
+
+def test_maac_critics_len_mismatch(tokenizer_05, model_05, model_06):
+    args = MAACConfig(num_agents=2, num_turns=1)
+    with pytest.raises(ValueError, match="critics length"):
+        MAACTrainer(
+            agents=[model_05, model_06],
+            critics=[model_05, model_06],
+            tokenizer=tokenizer_05,
+            reward_func=_reward_func,
+            args=args,
+        )
+
+
+def test_iac_model_name_critics(tokenizer_05, model_06):
+    args = IACConfig(num_agents=2, num_turns=1, use_separate_critic=True)
+    trainer = IACTrainer(
+        model=MODEL_NAME_05,
+        critics=[model_06, model_06],
+        tokenizer=tokenizer_05,
+        reward_func=_reward_func,
+        args=args,
+    )
+    assert len(trainer.agent_models) == 2
+    assert len(trainer.critic_models) == 2
+    _cleanup(trainer)
+
+
+def test_iac_model_and_agents_names(tokenizer_05):
+    args = IACConfig(num_agents=2, num_turns=1, use_separate_critic=False)
+    trainer = IACTrainer(
+        model=MODEL_NAME_05,
+        agents=[MODEL_NAME_05, MODEL_NAME_06],
+        tokenizer=tokenizer_05,
+        reward_func=_reward_func,
+        args=args,
+    )
+    assert len(trainer.agent_models) == 2
+    _cleanup(trainer)
+
+
+def test_iac_model_and_agents_len_mismatch(tokenizer_05):
+    args = IACConfig(num_agents=2, num_turns=1, use_separate_critic=False)
+    with pytest.raises(ValueError, match="both model and agents"):
+        IACTrainer(
+            model=MODEL_NAME_05,
+            agents=[MODEL_NAME_05],
+            tokenizer=tokenizer_05,
+            reward_func=_reward_func,
+            args=args,
+        )
+
+
+@pytest.mark.parametrize(
+    "agents_case",
+    ["homo", "hetero"],
+    ids=["shared_homo", "shared_hetero"],
+)
+def test_iac_shared_heads(agents_case, tokenizer_05, model_05, model_06):
+    args = IACConfig(num_agents=2, num_turns=1, use_separate_critic=False)
+    agents = [model_05, model_05] if agents_case == "homo" else [model_05, model_06]
+    trainer = IACTrainer(
+        agents=agents,
+        tokenizer=tokenizer_05,
+        reward_func=_reward_func,
+        args=args,
+    )
+    assert len(trainer.agent_models) == 2
+    assert all(c is None for c in trainer.critic_models)
+    _cleanup(trainer)
+
+
+def test_iac_shared_heads_rejects_critics(tokenizer_05, model_05):
+    args = IACConfig(num_agents=2, num_turns=1, use_separate_critic=False)
+    with pytest.raises(ValueError, match="use_separate_critic"):
+        IACTrainer(
+            agents=[model_05, model_05],
+            critics=[model_05, model_05],
+            tokenizer=tokenizer_05,
+            reward_func=_reward_func,
+            args=args,
+        )
+
+
+def test_iac_critics_len_mismatch(tokenizer_05, model_05):
+    args = IACConfig(num_agents=2, num_turns=1, use_separate_critic=True)
+    with pytest.raises(ValueError, match="critics length"):
+        IACTrainer(
+            model=MODEL_NAME_05,
+            critics=[model_05],
+            tokenizer=tokenizer_05,
+            reward_func=_reward_func,
+            args=args,
+        )
+
+
+@pytest.mark.parametrize(
+    "critics_case",
+    ["match", "swapped"],
+    ids=["critic_match", "critic_swapped"],
+)
+def test_iac_separate_critics(critics_case, tokenizer_05, model_05, model_06):
+    args = IACConfig(num_agents=2, num_turns=1, use_separate_critic=True)
+    critics = [model_05, model_06] if critics_case == "match" else [model_06, model_05]
+    trainer = IACTrainer(
+        agents=[model_05, model_06],
+        critics=critics,
+        tokenizer=tokenizer_05,
+        reward_func=_reward_func,
+        args=args,
+    )
+    assert len(trainer.agent_models) == 2
+    assert len(trainer.critic_models) == 2
+    _cleanup(trainer)