Source code for yoker.tools.web_backend

"""Web search and fetch backend protocol and implementations.

Provides pluggable backend architecture for web search and web fetch.
"""

import asyncio
import logging
from typing import TYPE_CHECKING, Protocol

from .web_types import FetchedContent, SearchResult, WebFetchError, WebSearchError

if TYPE_CHECKING:
  from ollama import AsyncClient

# Module-level logger named after this module; used to report backend request failures.
logger = logging.getLogger(__name__)



[docs]
class WebSearchBackend(Protocol):
  """Structural interface that every web search backend has to satisfy.

  Because this is a ``typing.Protocol``, a backend does not need to inherit
  from it; providing a matching async ``search`` method is enough.

  Known implementations:
    - OllamaWebSearchBackend: backed by Ollama's native web_search function
  """


[docs]
  async def search(self, query: str, max_results: int = 10) -> list[SearchResult]:
    """Run a web search for ``query`` and return the matching results.

    This is the protocol stub; concrete backends supply the behaviour.

    Args:
      query: Search query string.
      max_results: Upper bound on the number of results to return (1-50).

    Returns:
      List of SearchResult objects.

    Raises:
      WebSearchError: If the search fails.
    """
    ...





[docs]
class OllamaWebSearchBackend:
  """Search backend that delegates to the Ollama SDK's web_search function.

  The backend wraps an authenticated Ollama AsyncClient and relies on the
  SDK's built-in, cloud-based web search.

  Features:
    - Native Ollama SDK integration
    - No model selection needed
    - Built-in result formatting

  Limitations:
    - Requires OLLAMA_API_KEY for cloud-based search
    - Limited to 10 results
    - No domain filtering on client side
  """

  def __init__(self, async_client: "AsyncClient", timeout_seconds: int = 30) -> None:
    """Store the client and settings used by later searches.

    Args:
      async_client: Authenticated Ollama AsyncClient instance.
      timeout_seconds: Request timeout in seconds.
    """
    # Name reported as the ``source`` of results and the ``backend`` of errors.
    self._backend_name = "ollama"
    self._timeout_seconds = timeout_seconds
    self._client = async_client


[docs]
  async def search(self, query: str, max_results: int = 10) -> list[SearchResult]:
    """Execute a search via Ollama's web_search function.

    Calls ``client.web_search()`` and converts every returned item into a
    SearchResult tagged with this backend's name.

    Args:
      query: Search query string.
      max_results: Desired number of results. Clamped to the range 1-10
        (10 is Ollama's hard limit), so zero or negative values request a
        single result instead of producing a negative slice bound.

    Returns:
      List of SearchResult objects, at most the clamped ``max_results`` long.

    Raises:
      WebSearchError: If the request cannot connect, exceeds the configured
        timeout, is rate limited, or fails for any other reason. The
        triggering exception is passed along as ``cause``.
    """
    # Clamp to 1..10: the upper bound is Ollama's hard limit; the lower bound
    # keeps the slice below from going negative, which would silently drop
    # results from the end of the list.
    result_limit = max(1, min(max_results, 10))

    try:
      # Enforce the configured timeout on the request itself; without this
      # the timeout value would only ever appear in the error message.
      response = await asyncio.wait_for(
        self._client.web_search(query, max_results=result_limit),
        timeout=self._timeout_seconds,
      )

      # Missing fields are normalised to empty strings.
      results: list[SearchResult] = [
        SearchResult(
          title=str(item.title or ""),
          url=str(item.url or ""),
          snippet=str(item.content or ""),
          source=self._backend_name,
        )
        for item in response.results
      ]
      # Defensive slice in case the service returns more than requested.
      return results[:result_limit]

    except ConnectionError as e:
      logger.error("Ollama connection error: %s", e)
      raise WebSearchError(
        "Failed to connect to Ollama server",
        backend=self._backend_name,
        cause=e,
      ) from e
    except Exception as e:
      # Classify by exception class name so timeouts raised by asyncio or by
      # the underlying HTTP client are recognised without importing them.
      error_name = type(e).__name__
      if "Timeout" in error_name:
        logger.error("Ollama timeout: %s", e)
        raise WebSearchError(
          f"Search timeout after {self._timeout_seconds}s",
          backend=self._backend_name,
          cause=e,
        ) from e
      # Prefer an explicit HTTP status when the exception carries one; the
      # name/text heuristics remain as a fallback.
      rate_limited = (
        getattr(e, "status_code", None) == 429
        or "Rate" in error_name
        or "429" in str(e)
      )
      if rate_limited:
        logger.error("Ollama rate limit: %s", e)
        raise WebSearchError(
          "Rate limit exceeded, try again later",
          backend=self._backend_name,
          cause=e,
        ) from e
      logger.error("Ollama search error: %s", e)
      raise WebSearchError(
        f"Search failed: {e}",
        backend=self._backend_name,
        cause=e,
      ) from e





[docs]
class WebFetchBackend(Protocol):
  """Structural interface that every web fetch backend has to satisfy.

  Because this is a ``typing.Protocol``, a backend does not need to inherit
  from it; providing a matching async ``fetch`` method is enough.

  Known implementations:
    - OllamaWebFetchBackend: backed by Ollama's native web_fetch function
  """


[docs]
  async def fetch(
    self,
    url: str,
    *,
    content_type: str = "markdown",
    max_size_kb: int = 2048,
    timeout_seconds: int = 30,
  ) -> FetchedContent:
    """Retrieve the content behind ``url``.

    This is the protocol stub; concrete backends supply the behaviour.
    Everything after ``url`` is keyword-only.

    Args:
      url: URL to fetch.
      content_type: Output format ("markdown", "text", "html").
      max_size_kb: Maximum content size in KB.
      timeout_seconds: Fetch timeout in seconds.

    Returns:
      FetchedContent with extracted content.

    Raises:
      WebFetchError: If the fetch fails.
    """
    ...





[docs]
class OllamaWebFetchBackend:
  """Fetch backend that delegates to the Ollama SDK's web_fetch function.

  The backend wraps an authenticated Ollama AsyncClient and relies on the
  SDK's built-in, cloud-based fetch.

  Features:
    - Native Ollama SDK integration
    - Built-in content extraction and summarization
    - Configurable output format

  Limitations:
    - Requires OLLAMA_API_KEY for cloud-based fetch
    - Limited control over fetch process
    - Cannot enforce all client-side guardrails
  """

  def __init__(
    self,
    async_client: "AsyncClient",
    timeout_seconds: int = 30,
    max_size_kb: int = 2048,
  ) -> None:
    """Store the client and the per-backend defaults used by later fetches.

    Args:
      async_client: Authenticated Ollama AsyncClient instance.
      timeout_seconds: Default fetch timeout in seconds.
      max_size_kb: Default maximum content size in KB.
    """
    # Name reported as the ``source`` of results and the ``backend`` of errors.
    self._backend_name = "ollama"
    self._max_size_kb = max_size_kb
    self._timeout_seconds = timeout_seconds
    self._client = async_client


[docs]
  async def fetch(
    self,
    url: str,
    *,
    content_type: str = "markdown",
    max_size_kb: int | None = None,
    timeout_seconds: int | None = None,
  ) -> FetchedContent:
    """Fetch content via Ollama's web_fetch function.

    Calls ``client.web_fetch()`` under the effective timeout, then checks the
    UTF-8 size of the returned content against the effective size limit.

    Args:
      url: URL to fetch.
      content_type: Output format label stored on the result (default
        "markdown").
      max_size_kb: Max content size in KB; ``None`` (or 0) falls back to the
        backend default.
      timeout_seconds: Timeout in seconds; ``None`` (or 0) falls back to the
        backend default.

    Returns:
      FetchedContent with the extracted content, its title, and a
      ``size_kb`` entry in ``metadata``.

    Raises:
      WebFetchError: If the request cannot connect, times out, fails for any
        other reason, or the content exceeds the size limit. ``error_type``
        is one of "connection", "timeout", "size_limit" or "unknown".
    """
    max_size = max_size_kb or self._max_size_kb
    timeout = timeout_seconds or self._timeout_seconds

    try:
      # Enforce the effective timeout on the request itself; without this the
      # timeout value would only ever appear in the error message.
      # NOTE(review): content_type is only recorded on the result below; it is
      # not passed to web_fetch, so the actual format is whatever Ollama returns.
      response = await asyncio.wait_for(self._client.web_fetch(url), timeout=timeout)

      # Missing fields are normalised to empty strings.
      content = str(response.content or "")
      title = str(response.title or "")

      # Size is measured on the UTF-8 encoding, not the character count.
      content_size_kb = len(content.encode("utf-8")) / 1024
      if content_size_kb > max_size:
        raise WebFetchError(
          f"Content size ({content_size_kb:.1f}KB) exceeds limit ({max_size}KB)",
          url=url,
          backend=self._backend_name,
          error_type="size_limit",
        )

      return FetchedContent(
        url=url,
        title=title,
        content=content,
        content_type=content_type,
        source=self._backend_name,
        metadata={"size_kb": content_size_kb},
      )

    except WebFetchError:
      # The size-limit error raised above must not be re-wrapped as "unknown".
      raise
    except ConnectionError as e:
      logger.error("Ollama connection error: %s", e)
      raise WebFetchError(
        f"Failed to connect to Ollama server: {e}",
        url=url,
        backend=self._backend_name,
        cause=e,
        error_type="connection",
      ) from e
    except Exception as e:
      # Classify by exception class name so timeouts raised by asyncio or by
      # the underlying HTTP client are recognised without importing them.
      error_name = type(e).__name__
      if "Timeout" in error_name:
        logger.error("Ollama timeout: %s", e)
        raise WebFetchError(
          f"Fetch timeout after {timeout}s",
          url=url,
          backend=self._backend_name,
          cause=e,
          error_type="timeout",
        ) from e
      logger.error("Ollama fetch error: %s", e)
      raise WebFetchError(
        f"Fetch failed: {e}",
        url=url,
        backend=self._backend_name,
        cause=e,
        error_type="unknown",
      ) from e




# Public API: each protocol is followed by its Ollama-backed implementation.
__all__ = [
  "WebSearchBackend", "OllamaWebSearchBackend",
  "WebFetchBackend", "OllamaWebFetchBackend",
]