Source code for yoker.tools.webfetch
"""WebFetchTool implementation.
Provides web content fetching capability using pluggable backends with comprehensive
security guardrails including SSRF protection.
"""
import logging
from typing import TYPE_CHECKING, Any
from .base import Tool, ToolResult
from .web_backend import WebFetchBackend
from .web_guardrail import WebGuardrail
from .web_types import WebFetchError
if TYPE_CHECKING:
pass
logger = logging.getLogger(__name__)
[docs]
class WebFetchTool(Tool):
"""Tool for fetching web content using pluggable backends.
Fetches content from URLs and returns structured results.
Uses a configurable backend (Ollama native or local httpx).
Validates URLs through WebGuardrail before execution.
Example:
tool = WebFetchTool(backend=OllamaWebFetchBackend(async_client))
result = await tool.execute(url="https://example.com", content_type="markdown")
"""
_guardrail: WebGuardrail | None
_backend: WebFetchBackend | None
def __init__(
self,
backend: WebFetchBackend | None = None,
guardrail: WebGuardrail | None = None,
) -> None:
"""Initialize WebFetchTool with optional backend and guardrail.
Args:
backend: Optional backend for web fetch (defaults to None).
guardrail: Optional guardrail for URL validation.
"""
super().__init__(guardrail=guardrail)
self._backend = backend
@property
def name(self) -> str:
"""Tool name used for registration."""
return "web_fetch"
@property
def description(self) -> str:
"""Tool description shown to the LLM."""
return "Fetch content from a web URL"
[docs]
def get_schema(self) -> dict[str, Any]:
"""Return Ollama-compatible schema.
Returns:
Schema with url, content_type, and max_size_kb parameters.
"""
return {
"type": "function",
"function": {
"name": self.name,
"description": self.description,
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL to fetch",
},
"content_type": {
"type": "string",
"description": "Output format (markdown, text, html)",
"enum": ["markdown", "text", "html"],
"default": "markdown",
},
"max_size_kb": {
"type": "integer",
"description": "Maximum content size in KB",
"default": 2048,
"minimum": 1,
"maximum": 10240,
},
},
"required": ["url"],
},
},
}
[docs]
async def execute(self, **kwargs: Any) -> ToolResult:
"""Execute web fetch with the given parameters asynchronously.
Steps:
1. Validate parameters via guardrail if provided.
2. Extract and validate URL parameter.
3. Delegate to backend for fetch execution.
4. Return structured results or error.
Args:
**kwargs: Must contain 'url', optionally 'content_type', 'max_size_kb'.
Returns:
ToolResult with FetchedContent dict or error.
"""
# Step 1: Extract URL parameter
url = kwargs.get("url", "")
if not url:
return ToolResult(
success=False,
result={},
error="URL is required",
)
# Strip whitespace and validate
url = url.strip()
if not url:
return ToolResult(
success=False,
result={},
error="URL cannot be empty or whitespace",
)
# Step 2: Extract optional parameters
content_type = kwargs.get("content_type", "markdown")
if content_type not in ("markdown", "text", "html"):
content_type = "markdown"
max_size_kb = kwargs.get("max_size_kb", 2048)
if not isinstance(max_size_kb, int):
try:
max_size_kb = int(max_size_kb)
except (ValueError, TypeError):
max_size_kb = 2048
max_size_kb = max(1, min(10240, max_size_kb))
# Step 3: Validate via guardrail
if self._guardrail:
validation = self._guardrail.validate_url(url)
if not validation.valid:
return ToolResult(
success=False,
result={},
error=validation.reason,
)
# Step 4: Check backend
if self._backend is None:
return ToolResult(
success=False,
result={},
error="No backend configured for web fetch",
)
# Step 5: Execute fetch
try:
content = await self._backend.fetch(
url=url,
content_type=content_type,
max_size_kb=max_size_kb,
)
return ToolResult(
success=True,
result=content.to_dict(),
)
except WebFetchError as e:
logger.error(f"Web fetch error: {e}")
return ToolResult(
success=False,
result={},
error=str(e),
)
except Exception as e:
logger.error(f"Unexpected error in web fetch: {e}")
return ToolResult(
success=False,
result={},
error=f"Fetch failed: {e}",
)
__all__ = [
"WebFetchTool",
]