diff --git a/agent/model_metadata.py b/agent/model_metadata.py index d883263e6..afd8bee19 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -625,8 +625,6 @@ def fetch_endpoint_model_metadata( if isinstance(ctx, int) and ctx > 0: context_length = ctx break - if context_length is None: - context_length = _extract_context_length(model) if context_length is not None: entry["context_length"] = context_length @@ -1016,10 +1014,7 @@ def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> ctx = cfg.get("context_length") if ctx and isinstance(ctx, (int, float)): return int(ctx) - # Fall back to max_context_length (theoretical model max) - ctx = m.get("max_context_length") or m.get("context_length") - if ctx and isinstance(ctx, (int, float)): - return int(ctx) + break # LM Studio / vLLM / llama.cpp: try /v1/models/{model} resp = client.get(f"{server_url}/v1/models/{model}") diff --git a/run_agent.py b/run_agent.py index 65be5add9..6668cd543 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2145,9 +2145,25 @@ class AIAgent: if config_context_length is None: config_context_length = getattr(self, "_config_context_length", None) target_ctx = max(config_context_length or 0, MINIMUM_CONTEXT_LENGTH) - ensure_lmstudio_model_loaded( + loaded_ctx = ensure_lmstudio_model_loaded( self.model, self.base_url, getattr(self, "api_key", ""), target_ctx, ) + if loaded_ctx: + self._lmstudio_loaded_context = loaded_ctx + # Push into the live compressor so the status bar reflects the + # real loaded ctx the moment the load resolves, instead of + # holding the previous model's value (or "ctx --") through the + # next render tick. + cc = getattr(self, "context_compressor", None) + if cc is not None: + cc.update_model( + model=self.model, + context_length=loaded_ctx, + base_url=self.base_url, + api_key=getattr(self, "api_key", ""), + provider=self.provider, + api_mode=self.api_mode, + ) except Exception as err: logger.debug("LM Studio preload skipped: %s", err) diff --git a/tests/agent/test_model_metadata_local_ctx.py b/tests/agent/test_model_metadata_local_ctx.py index 5da1ed703..f449255c0 100644 --- a/tests/agent/test_model_metadata_local_ctx.py +++ b/tests/agent/test_model_metadata_local_ctx.py @@ -274,13 +274,15 @@ class TestQueryLocalContextLengthLmStudio: return client_mock def test_lmstudio_exact_key_match(self): - """Reads max_context_length when key matches exactly.""" + """Resolves loaded ctx when key matches exactly.""" from agent.model_metadata import _query_local_context_length native_resp = self._make_resp(200, { "models": [ - {"key": "nvidia/nvidia-nemotron-super-49b-v1", "id": "nvidia/nvidia-nemotron-super-49b-v1", - "max_context_length": 131072}, + {"key": "nvidia/nvidia-nemotron-super-49b-v1", + "id": "nvidia/nvidia-nemotron-super-49b-v1", + "max_context_length": 1_048_576, + "loaded_instances": [{"config": {"context_length": 131072}}]}, ] }) client_mock = self._make_client( @@ -310,7 +312,8 @@ class TestQueryLocalContextLengthLmStudio: "models": [ {"key": "nvidia/nvidia-nemotron-super-49b-v1", "id": "nvidia/nvidia-nemotron-super-49b-v1", - "max_context_length": 131072}, + "max_context_length": 1_048_576, + "loaded_instances": [{"config": {"context_length": 131072}}]}, ] }) client_mock = self._make_client( @@ -463,7 +466,10 @@ class TestFetchEndpointModelMetadataLmStudio: { "key": "lmstudio-community/Qwen3.5-27B-GGUF/Qwen3.5-27B-Q8_0.gguf", "id": "lmstudio-community/Qwen3.5-27B-GGUF/Qwen3.5-27B-Q8_0.gguf", - "max_context_length": 131072, + "max_context_length": 1_048_576, + "loaded_instances": [ + {"config": {"context_length": 131072}} + ], } ] }