diff --git a/.github/workflows/scheduled-release.yml b/.github/workflows/scheduled-release.yml new file mode 100644 index 000000000..dbd1f79b5 --- /dev/null +++ b/.github/workflows/scheduled-release.yml @@ -0,0 +1,267 @@ +name: Scheduled Release + +on: + schedule: + # Mondays at 9 AM UTC (GitHub Actions cron cannot express "every 2 weeks"; the change check below skips runs with no new commits) + - cron: '0 9 * * 1' + workflow_dispatch: # Allow manual trigger + inputs: + skip_tests: + description: 'Skip LLM tests (use for testing workflow)' + required: false + default: false + type: boolean + dry_run: + description: 'Dry run - do not push changes or create release' + required: false + default: false + type: boolean + +jobs: + test-and-release: + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Setup UV + uses: astral-sh/setup-uv@v3 + + - name: Install dependencies + run: | + uv sync --dev + + - name: Run linting + run: | + uv run ruff check instructor examples tests + + - name: Run type checking + run: | + uv run pyright + + - name: Run core tests (no LLM) + run: | + uv run pytest tests/ -k "not openai and not llm and not anthropic and not gemini and not cohere and not mistral and not groq and not vertexai and not xai and not cerebras and not fireworks and not writer and not bedrock and not perplexity and not genai" --tb=short -v --maxfail=10 + + # Optional: Run LLM tests if you have API keys in secrets + - name: Run LLM tests + if: github.event.inputs.skip_tests != 'true' + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} + GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} + MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} + run: | + echo "Running basic LLM tests if API keys are available..." + # Run a subset of LLM tests to verify basic functionality + if [ ! -z "$OPENAI_API_KEY" ]; then + echo "Testing OpenAI integration..." + uv run pytest tests/llm/test_openai/test_basics.py --tb=short -v --maxfail=1 || echo "OpenAI tests failed" + fi + if [ ! -z "$ANTHROPIC_API_KEY" ]; then + echo "Testing Anthropic integration..." 
+ uv run pytest tests/llm/test_anthropic/test_basics.py --tb=short -v --maxfail=1 || echo "Anthropic tests failed" + fi + echo "LLM tests completed (non-blocking)" + + - name: Check for changes since last release + id: changes + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "") + if [ -z "$LAST_TAG" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + echo "last_tag=none" >> $GITHUB_OUTPUT + echo "change_count=initial" >> $GITHUB_OUTPUT + else + CHANGES=$(git rev-list $LAST_TAG..HEAD --count) + echo "has_changes=$([[ $CHANGES -gt 0 ]] && echo true || echo false)" >> $GITHUB_OUTPUT + echo "change_count=$CHANGES" >> $GITHUB_OUTPUT + echo "last_tag=$LAST_TAG" >> $GITHUB_OUTPUT + fi + + echo "Last tag: $LAST_TAG" + echo "Changes since last tag: $(git rev-list $LAST_TAG..HEAD --count 2>/dev/null || echo 'N/A')" + + # Only proceed with release if tests passed AND there are changes + - name: Get current version + if: steps.changes.outputs.has_changes == 'true' + id: current_version + run: | + VERSION=$(uv run python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])") + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Current version: $VERSION" + + - name: Determine version bump type + if: steps.changes.outputs.has_changes == 'true' + id: version_type + run: | + # Check commit messages since last tag to determine bump type + LAST_TAG="${{ steps.changes.outputs.last_tag }}" + if [ "$LAST_TAG" = "none" ]; then + COMMITS=$(git log --oneline HEAD~20..HEAD) + else + COMMITS=$(git log --oneline $LAST_TAG..HEAD) + fi + + echo "Recent commits:" + echo "$COMMITS" + + # Look for breaking changes or major features + if echo "$COMMITS" | grep -qE "(BREAKING|feat!|fix!)"; then + echo "bump_type=minor" >> $GITHUB_OUTPUT + echo "Detected breaking changes - using minor bump" + elif echo "$COMMITS" | grep -qE "feat:"; then + echo "bump_type=minor" >> $GITHUB_OUTPUT + echo "Detected new features - using minor bump" + else + echo "bump_type=patch" >> $GITHUB_OUTPUT + echo "Using patch bump for bug fixes and chores" + fi + + - name: Bump version + if: steps.changes.outputs.has_changes == 'true' + id: bump_version + run: | + CURRENT="${{ steps.current_version.outputs.version }}" + BUMP_TYPE="${{ steps.version_type.outputs.bump_type }}" + + IFS='.' 
read -r major minor patch <<< "$CURRENT" + + case $BUMP_TYPE in + major) + major=$((major + 1)) + minor=0 + patch=0 + ;; + minor) + minor=$((minor + 1)) + patch=0 + ;; + patch) + patch=$((patch + 1)) + ;; + esac + + NEW_VERSION="$major.$minor.$patch" + echo "new_version=$NEW_VERSION" >> $GITHUB_OUTPUT + echo "Bumping from $CURRENT to $NEW_VERSION ($BUMP_TYPE)" + + # Update pyproject.toml + sed -i "s/version = \"$CURRENT\"/version = \"$NEW_VERSION\"/" pyproject.toml + + - name: Update lockfile + if: steps.changes.outputs.has_changes == 'true' + run: | + uv lock + + # Run tests again after version bump to make sure nothing broke + - name: Final test run + if: steps.changes.outputs.has_changes == 'true' + run: | + uv sync + uv run pytest tests/ -k "not openai and not llm and not anthropic and not gemini and not cohere and not mistral and not groq and not vertexai and not xai and not cerebras and not fireworks and not writer and not bedrock and not perplexity and not genai" --tb=short --maxfail=5 + + - name: Generate changelog + if: steps.changes.outputs.has_changes == 'true' + id: changelog + run: | + LAST_TAG="${{ steps.changes.outputs.last_tag }}" + NEW_VERSION="${{ steps.bump_version.outputs.new_version }}" + + if [ "$LAST_TAG" = "none" ]; then + CHANGELOG=$(git log --oneline HEAD~30..HEAD --pretty=format:"- %s" | head -20) + else + CHANGELOG=$(git log --oneline $LAST_TAG..HEAD --pretty=format:"- %s") + fi + + # Save changelog to file for GitHub release + cat > CHANGELOG.md << EOF + ## 🚀 What's Changed + + $CHANGELOG + + ## 🔗 Links + **Full Changelog**: https://github.com/${{ github.repository }}/compare/$LAST_TAG...v$NEW_VERSION + + --- + 🤖 *This release was generated automatically by the scheduled release workflow* + EOF + + echo "changelog_file=CHANGELOG.md" >> $GITHUB_OUTPUT + + - name: Create release commit + if: steps.changes.outputs.has_changes == 'true' + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" + git add pyproject.toml uv.lock + git commit -m "chore: automated release v${{ steps.bump_version.outputs.new_version }} + + 🤖 Generated with [Claude Code](https://claude.ai/code) + + Co-Authored-By: GitHub Action <action@github.com>" + git tag "v${{ steps.bump_version.outputs.new_version }}" + + - name: Push changes + if: steps.changes.outputs.has_changes == 'true' && github.event.inputs.dry_run != 'true' + run: | + git push origin main + git push origin "v${{ steps.bump_version.outputs.new_version }}" + + - name: Create GitHub Release + if: steps.changes.outputs.has_changes == 'true' && github.event.inputs.dry_run != 'true' + uses: ncipollo/release-action@v1 + with: + tag: "v${{ steps.bump_version.outputs.new_version }}" + name: "🚀 Release v${{ steps.bump_version.outputs.new_version }}" + bodyFile: "CHANGELOG.md" + draft: false + prerelease: false + + - name: Dry run summary + if: steps.changes.outputs.has_changes == 'true' && github.event.inputs.dry_run == 'true' + run: | + echo "🧪 DRY RUN MODE - No changes pushed" + echo "Would have released: v${{ steps.bump_version.outputs.new_version }}" + cat CHANGELOG.md + + # Optional: Publish to PyPI (uncomment if you want automatic PyPI releases) + # - name: Build and publish to PyPI + # if: steps.changes.outputs.has_changes == 'true' && secrets.PYPI_TOKEN != '' + # env: + # PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + # run: | + # uv build + # uv publish --token $PYPI_TOKEN + + # Summary outputs + - name: Summary + if: always() + run: | + echo "## 📊 Scheduled Release Summary" >> $GITHUB_STEP_SUMMARY + 
echo "- **Branch**: ${{ github.ref }}" >> $GITHUB_STEP_SUMMARY + echo "- **Has Changes**: ${{ steps.changes.outputs.has_changes }}" >> $GITHUB_STEP_SUMMARY + echo "- **Change Count**: ${{ steps.changes.outputs.change_count }}" >> $GITHUB_STEP_SUMMARY + if [ "${{ steps.changes.outputs.has_changes }}" = "true" ]; then + echo "- **Version**: ${{ steps.current_version.outputs.version }} โ†’ ${{ steps.bump_version.outputs.new_version }}" >> $GITHUB_STEP_SUMMARY + echo "- **Bump Type**: ${{ steps.version_type.outputs.bump_type }}" >> $GITHUB_STEP_SUMMARY + echo "- **Status**: โœ… Released" >> $GITHUB_STEP_SUMMARY + else + echo "- **Status**: โญ๏ธ Skipped (no changes)" >> $GITHUB_STEP_SUMMARY + fi + + - name: Notify on failure + if: failure() + run: | + echo "โŒ Scheduled release failed - check the logs above" + echo "Common issues:" + echo "- Tests failed" + echo "- Linting issues" + echo "- Type checking errors" + echo "- Git push permissions" \ No newline at end of file diff --git a/docs/concepts/multimodal.md b/docs/concepts/multimodal.md index d22ba38c3..d63220b6a 100644 --- a/docs/concepts/multimodal.md +++ b/docs/concepts/multimodal.md @@ -14,7 +14,7 @@ description: Learn how the Image, PDF and Audio class in Instructor enables seam Instructor provides a unified, provider-agnostic interface for working with multimodal inputs like images, PDFs, and audio files. -With Instructor's multimodal objects, you can easily load media from URLs, local files, or base64 strings using a consistent API that works across different AI providers (OpenAI, Anthropic, Mistral, etc.). +With Instructor's multimodal objects, you can easily load media from URLs, Google Cloud Storage URLs, local files, or base64 strings using a consistent API that works across different AI providers (OpenAI, Anthropic, Mistral, etc.). Instructor handles all the provider-specific formatting requirements behind the scenes, ensuring your code remains clean and future-proof as provider APIs evolve. Let's see how to use the Image, Audio and PDF classes. @@ -22,12 +22,13 @@ Instructor handles all the provider-specific formatting requirements behind the This class represents an image that can be loaded from a URL or file path. It provides a set of methods to create `Image` instances from different sources (Eg. URLs, paths and base64 strings). The following shows which methods are supported for the individual providers. -| Method | OpenAI | Anthropic | Google GenAI | -| --------------- | ------ | --------- | ------------ | -| `from_url()` | โœ… | โœ… | โœ… | -| `from_path()` | โœ… | โœ… | โœ… | -| `from_base64()` | โœ… | โœ… | โœ… | -| `autodetect()` | โœ… | โœ… | โœ… | +| Method | OpenAI | Anthropic | Google GenAI | +| ----------------- | ------ | --------- | ------------ | +| `from_url()` | โœ… | โœ… | โœ… | +| `from_gs_url()` | โœ… | โœ… | โœ… | +| `from_path()` | โœ… | โœ… | โœ… | +| `from_base64()` | โœ… | โœ… | โœ… | +| `autodetect()` | โœ… | โœ… | โœ… | We also support Anthropic Prompt Caching for images with the `ImageWith @@ -35,7 +36,7 @@ We also support Anthropic Prompt Caching for images with the `ImageWith By using the `Image` class, we can abstract away the differences between the different formats, allowing you to work with a unified interface. -You can create an `Image` instance from a URL or file path using the `from_url` or `from_path` methods. The `Image` class will automatically convert the image to a base64-encoded string and include it in the API request. 
+You can create an `Image` instance from a URL, Google Cloud Storage (GCS) URL, or file path using the `from_url`, `from_gs_url`, or `from_path` methods. The `Image` class will automatically convert the image to a base64-encoded string and include it in the API request. ```python import instructor @@ -71,7 +72,45 @@ print(response) # > description='A bush with numerous clusters of blueberries surrounded by green leaves, under a cloudy sky.' items=['blueberries', 'green leaves', 'cloudy sky'] ``` -We also provide a `autodetect_image` keyword argument that allows you to provide URLs or file paths as normal strings when you set it to true. +### Google Cloud Storage Support + +Instructor now supports loading images directly from Google Cloud Storage URLs. This is particularly useful when working with images stored in GCS buckets. + +```python +import instructor +from instructor.processing.multimodal import Image +from pydantic import BaseModel + + +class ImageDescription(BaseModel): + description: str + items: list[str] + + +# Load image from GCS URL (must be publicly accessible) +gs_url = "gs://my-bucket/path/to/image.jpg" + +client = instructor.from_provider("openai/gpt-4.1-mini") + +response = client.chat.completions.create( + response_model=ImageDescription, + messages=[ + { + "role": "user", + "content": [ + "What is in this image?", + Image.from_gs_url(gs_url), + ], + } + ], +) + +print(response) +``` + +> **Note**: GCS URLs must point to publicly accessible objects. The `from_gs_url` method converts `gs://` URLs to `https://storage.googleapis.com/` URLs for access. + +We also provide an `autodetect_images` keyword argument that allows you to provide URLs, GCS URLs, or file paths as normal strings when you set it to true. The system will automatically detect and handle different media types including images, audio, and PDFs. You can see an example below. @@ -163,14 +202,17 @@ By leveraging Instructor's multimodal capabilities, you can focus on building yo Similar to the Image class, we provide methods to create `Audio` instances. -| Method | OpenAI | Google GenAI | -| ------------- | ------ | ------------ | -| `from_url()` | ✅ | ✅ | -| `from_path()` | ✅ | ✅ | +| Method | OpenAI | Google GenAI | +| --------------- | ------ | ------------ | +| `from_url()` | ✅ | ✅ | +| `from_gs_url()` | ✅ | ✅ | +| `from_path()` | ✅ | ✅ | +| `from_base64()` | ✅ | ✅ | +| `autodetect()` | ✅ | ✅ | -The `Audio` class represents an audio file that can be loaded from a URL or file path. It provides methods to create `Audio` instances using the `from_path` and `from_url` methods. +The `Audio` class represents an audio file that can be loaded from a URL, Google Cloud Storage URL, or file path. It provides methods to create `Audio` instances using the `from_path`, `from_url`, `from_gs_url`, `from_base64`, and `autodetect` methods. -The `Audio` class will automatically convert it to a the right format and include it in the API request. +The `Audio` class will automatically convert it to the right format and include it in the API request. 
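As a quick illustration of the `from_base64` support listed in the table above, here is a minimal sketch; the payload is a placeholder rather than real audio, and it simply exercises the `data:` URI parsing added in this patch:

```python
import base64

from instructor.processing.multimodal import Audio

# Placeholder payload - any valid data:audio/* base64 URI is parsed the same way
payload = base64.b64encode(b"\x00\x01\x02\x03").decode("utf-8")
data_uri = f"data:audio/wav;base64,{payload}"

audio = Audio.from_base64(data_uri)
print(audio.media_type)  # audio/wav
print(audio.data == payload)  # True - the base64 body is stored as-is
```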
```python from openai import OpenAI @@ -210,18 +252,59 @@ resp = client.chat.completions.create( print(resp) ``` +### Google Cloud Storage Support + +You can also load audio files directly from Google Cloud Storage: + +```python +from openai import OpenAI +from pydantic import BaseModel +import instructor +from instructor.processing.multimodal import Audio + +# Initialize the client +client = instructor.from_provider("openai/gpt-4o-audio-preview") + +# Define our response model +class AudioDescription(BaseModel): + summary: str + transcript: str + +# Load audio from GCS URL (must be publicly accessible) +gs_url = "gs://my-bucket/path/to/audio.wav" + +# Make the API call with the GCS audio file +resp = client.chat.completions.create( + response_model=AudioDescription, + modalities=["text"], + audio={"voice": "alloy", "format": "wav"}, + messages=[ + { + "role": "user", + "content": [ + "Extract the following information from the audio:", + Audio.from_gs_url(gs_url), + ], + }, + ], +) + +print(resp) +``` + ## `PDF` The `PDF` class represents a PDF file that can be loaded from a URL or file path. It provides methods to create `PDF` instances and is currently supported for OpenAI, Mistral, GenAI and Anthropic client integrations. -| Method | OpenAI | Anthropic | Google GenAI | Mistral | -| --------------- | ------ | --------- | ------------ | ------- | -| `from_url()` | ✅ | ✅ | ✅ | ✅ | -| `from_path()` | ✅ | ✅ | ✅ | ❎ | -| `from_base64()` | ✅ | ✅ | ✅ | ❎ | -| `autodetect()` | ✅ | ✅ | ✅ | ✅ | +| Method | OpenAI | Anthropic | Google GenAI | Mistral | +| ----------------- | ------ | --------- | ------------ | ------- | +| `from_url()` | ✅ | ✅ | ✅ | ✅ | +| `from_gs_url()` | ✅ | ✅ | ✅ | ✅ | +| `from_path()` | ✅ | ✅ | ✅ | ❎ | +| `from_base64()` | ✅ | ✅ | ✅ | ❎ | +| `autodetect()` | ✅ | ✅ | ✅ | ✅ | For Gemini, we also provide two additional methods that make it easy to work with the google-genai files package; you can access them via the `PDFWithGenaiFile` object. @@ -266,6 +349,43 @@ We provide examples of how to use all three object classes below. # > Total = 220, items = ['English Tea', 'Tofu'] ``` +### Google Cloud Storage Support + +You can load PDF files directly from Google Cloud Storage URLs: + +```python +from openai import OpenAI +import instructor +from pydantic import BaseModel +from instructor.processing.multimodal import PDF + +# Set up the client +gs_url = "gs://my-bucket/path/to/document.pdf" +client = instructor.from_provider("openai/gpt-4.1-mini") + +# Create a model for analyzing PDFs +class Invoice(BaseModel): + total: float + items: list[str] + +# Load and analyze a PDF from GCS (must be publicly accessible) +response = client.chat.completions.create( + response_model=Invoice, + messages=[ + { + "role": "user", + "content": [ + "Analyze this document", + PDF.from_gs_url(gs_url), + ], + } + ], +) + +print(response) +# > Total = 220, items = ['English Tea', 'Tofu'] +``` + ### Caching If you'd like to cache the PDF for Anthropic, we provide the `PDFWithCacheControl` class which has caching configured by default. diff --git a/instructor/exceptions.py b/instructor/exceptions.py new file mode 100644 index 000000000..ac0303a51 --- /dev/null +++ b/instructor/exceptions.py @@ -0,0 +1,42 @@ +"""Backward compatibility module for instructor.exceptions imports. + +.. deprecated:: 1.11.0 + This module is deprecated. Import exceptions from `instructor.core` instead. 
+ For example: `from instructor.core import InstructorRetryException` +""" + +import warnings + +# Show deprecation warning when this module is imported +warnings.warn( + "Importing from 'instructor.exceptions' is deprecated and will be removed in a future version. " + "Please import from 'instructor.core' instead. " + "For example: 'from instructor.core import InstructorRetryException'", + DeprecationWarning, + stacklevel=2, +) + +# Explicit re-exports for better IDE support and clarity +from .core.exceptions import ( + AsyncValidationError, + ClientError, + ConfigurationError, + IncompleteOutputException, + InstructorError, + InstructorRetryException, + ModeError, + ProviderError, + ValidationError, +) + +__all__ = [ + "AsyncValidationError", + "ClientError", + "ConfigurationError", + "IncompleteOutputException", + "InstructorError", + "InstructorRetryException", + "ModeError", + "ProviderError", + "ValidationError", +] diff --git a/instructor/processing/multimodal.py b/instructor/processing/multimodal.py index 261dddcef..ab9126ce6 100644 --- a/instructor/processing/multimodal.py +++ b/instructor/processing/multimodal.py @@ -87,8 +87,6 @@ def autodetect(cls, source: str | Path) -> Image: if isinstance(source, Path): return cls.from_path(source) - raise ValueError("Unable to determine image type or unsupported image format") - @classmethod def autodetect_safely(cls, source: Union[str, Path]) -> Union[Image, str]: # noqa: UP007 """Safely attempt to autodetect an image from a source string or path. @@ -121,9 +119,13 @@ def from_base64(cls, data_uri: str) -> Image: ) @classmethod - def from_gs_url(cls, data_uri: str) -> Image: + def from_gs_url(cls, data_uri: str, timeout: int = 30) -> Image: """ Create an Image instance from a Google Cloud Storage URL. 
+ + Args: + data_uri: GCS URL starting with gs:// + timeout: Request timeout in seconds (default: 30) """ if not data_uri.startswith("gs://"): raise ValueError("URL must start with gs://") @@ -131,7 +133,7 @@ def from_gs_url(cls, data_uri: str) -> Image: public_url = f"https://storage.googleapis.com/{data_uri[5:]}" try: - response = requests.get(public_url) + response = requests.get(public_url, timeout=timeout) response.raise_for_status() media_type = response.headers.get("Content-Type") if media_type not in VALID_MIME_TYPES: @@ -141,7 +143,9 @@ def from_gs_url(cls, data_uri: str) -> Image: return cls(source=data_uri, media_type=media_type, data=data) except requests.RequestException as e: - raise ValueError(f"We only support public images for now") from e + raise ValueError( + "Failed to access GCS image (must be publicly readable)" + ) from e @classmethod # Caching likely unnecessary def from_raw_base64(cls, data: str) -> Image: @@ -165,6 +169,8 @@ def from_raw_base64(cls, data: str) -> Image: @classmethod @lru_cache def from_url(cls, url: str) -> Image: + if url.startswith("gs://"): + return cls.from_gs_url(url) if cls.is_base64(url): return cls.from_base64(url) @@ -301,9 +307,66 @@ class Audio(BaseModel): ) media_type: str = Field(description="MIME type of the audio") + @classmethod + def autodetect(cls, source: str | Path) -> Audio: + """Attempt to autodetect an audio from a source string or Path.""" + if isinstance(source, str): + if cls.is_base64(source): + return cls.from_base64(source) + if source.startswith(("http://", "https://")): + return cls.from_url(source) + if source.startswith("gs://"): + return cls.from_gs_url(source) + # Since detecting the max length of a file universally cross-platform is difficult, + # we'll just try/catch the Path conversion and file check + try: + path = Path(source) + if path.is_file(): + return cls.from_path(path) + except OSError: + pass # Fall through to error + + raise ValueError("Unable to determine audio source") + + if isinstance(source, Path): + return cls.from_path(source) + + @classmethod + def autodetect_safely(cls, source: Union[str, Path]) -> Union[Audio, str]: # noqa: UP007 + """Safely attempt to autodetect an audio from a source string or path. + + Args: + source (Union[str,path]): The source string or path. + Returns: + An Audio if the source is detected to be a valid audio, otherwise + the source itself as a string. 
+ """ + try: + return cls.autodetect(source) + except ValueError: + return str(source) + + @classmethod + def is_base64(cls, s: str) -> bool: + return bool(re.match(r"^data:audio/[a-zA-Z0-9+-]+;base64,", s)) + + @classmethod + def from_base64(cls, data_uri: str) -> Audio: + header, encoded = data_uri.split(",", 1) + media_type = header.split(":")[1].split(";")[0] + if media_type not in VALID_AUDIO_MIME_TYPES: + raise ValueError(f"Unsupported audio format: {media_type}") + return cls( + source=data_uri, + media_type=media_type, + data=encoded, + ) + @classmethod def from_url(cls, url: str) -> Audio: """Create an Audio instance from a URL.""" + if url.startswith("gs://"): + return cls.from_gs_url(url) response = requests.get(url) content_type = response.headers.get("content-type") assert content_type in VALID_AUDIO_MIME_TYPES, ( @@ -336,6 +399,35 @@ def from_path(cls, path: Union[str, Path]) -> Audio: # noqa: UP007 data = base64.b64encode(path.read_bytes()).decode("utf-8") return cls(source=str(path), data=data, media_type=mime_type) + @classmethod + def from_gs_url(cls, data_uri: str, timeout: int = 30) -> Audio: + """ + Create an Audio instance from a Google Cloud Storage URL. + + Args: + data_uri: GCS URL starting with gs:// + timeout: Request timeout in seconds (default: 30) + """ + if not data_uri.startswith("gs://"): + raise ValueError("URL must start with gs://") + + public_url = f"https://storage.googleapis.com/{data_uri[5:]}" + + try: + response = requests.get(public_url, timeout=timeout) + response.raise_for_status() + media_type = response.headers.get("Content-Type") + if media_type not in VALID_AUDIO_MIME_TYPES: + raise ValueError(f"Unsupported audio format: {media_type}") + + data = base64.b64encode(response.content).decode("utf-8") + + return cls(source=data_uri, media_type=media_type, data=data) + except requests.RequestException as e: + raise ValueError( + "Failed to access GCS audio (must be publicly readable)" + ) from e + def to_openai(self, mode: Mode) -> dict[str, Any]: """Convert the Audio instance to OpenAI's API format.""" if mode in {Mode.RESPONSES_TOOLS, Mode.RESPONSES_TOOLS_WITH_INBUILT_TOOLS}: @@ -415,6 +507,8 @@ def autodetect(cls, source: str | Path) -> PDF: return cls.from_base64(source) elif source.startswith(("http://", "https://")): return cls.from_url(source) + elif source.startswith("gs://"): + return cls.from_gs_url(source) try: if Path(source).is_file(): @@ -430,7 +524,20 @@ def autodetect(cls, source: str | Path) -> PDF: elif isinstance(source, Path): return cls.from_path(source) - raise ValueError("Unable to determine PDF type or unsupported PDF format") + @classmethod + def autodetect_safely(cls, source: Union[str, Path]) -> Union[PDF, str]: # noqa: UP007 + """Safely attempt to autodetect a PDF from a source string or path. + + Args: + source (Union[str,path]): The source string or path. + Returns: + A PDF if the source is detected to be a valid PDF, otherwise + the source itself as a string. + """ + try: + return cls.autodetect(source) + except ValueError: + return str(source) @classmethod def is_base64(cls, s: str) -> bool: @@ -480,9 +587,40 @@ def from_raw_base64(cls, data: str) -> PDF: except Exception as e: raise ValueError("Invalid or unsupported base64 PDF data") from e + @classmethod + def from_gs_url(cls, data_uri: str, timeout: int = 30) -> PDF: + """ + Create a PDF instance from a Google Cloud Storage URL. 
+ + Args: + data_uri: GCS URL starting with gs:// + timeout: Request timeout in seconds (default: 30) + """ + if not data_uri.startswith("gs://"): + raise ValueError("URL must start with gs://") + + public_url = f"https://storage.googleapis.com/{data_uri[5:]}" + + try: + response = requests.get(public_url, timeout=timeout) + response.raise_for_status() + media_type = response.headers.get("Content-Type", "application/pdf") + if media_type not in VALID_PDF_MIME_TYPES: + raise ValueError(f"Unsupported PDF format: {media_type}") + + data = base64.b64encode(response.content).decode("utf-8") + + return cls(source=data_uri, media_type=media_type, data=data) + except requests.RequestException as e: + raise ValueError( + "Failed to access GCS PDF (must be publicly readable)" + ) from e + @classmethod @lru_cache def from_url(cls, url: str) -> PDF: + if url.startswith("gs://"): + return cls.from_gs_url(url) parsed_url = urlparse(url) media_type, _ = mimetypes.guess_type(parsed_url.path) @@ -741,6 +879,46 @@ def convert_contents( return converted_contents +def autodetect_media( + source: str | Path | Image | Audio | PDF, +) -> Image | Audio | PDF | str: + """Autodetect images, audio, or PDFs from a given source. + + Args: + source: URL, file path, Path, or data URI to inspect. + + Returns: + The detected :class:`Image`, :class:`Audio`, or :class:`PDF` instance. + If detection fails, the original source is returned. + """ + if isinstance(source, (Image, Audio, PDF)): + return source + + # Normalize once for cheap checks and mimetype guess + source = str(source) + + if source.startswith("data:image/"): + return Image.autodetect_safely(source) + if source.startswith("data:audio/"): + return Audio.autodetect_safely(source) + if source.startswith("data:application/pdf"): + return PDF.autodetect_safely(source) + + media_type, _ = mimetypes.guess_type(source) + if media_type in VALID_MIME_TYPES: + return Image.autodetect_safely(source) + if media_type in VALID_AUDIO_MIME_TYPES: + return Audio.autodetect_safely(source) + if media_type in VALID_PDF_MIME_TYPES: + return PDF.autodetect_safely(source) + + for cls in (Image, Audio, PDF): + item = cls.autodetect_safely(source) # type: ignore[arg-type] + if not isinstance(item, str): + return item + return source + + def convert_messages( messages: list[ dict[ @@ -750,7 +928,8 @@ def convert_messages( dict[str, Any], Image, Audio, - list[Union[str, dict[str, Any], Image, Audio]], # noqa: UP007 + PDF, + list[Union[str, dict[str, Any], Image, Audio, PDF]], # noqa: UP007 ], ] ], @@ -776,10 +955,10 @@ def is_image_params(x: Any) -> bool: } if autodetect_images: if isinstance(content, list): - new_content: list[str | dict[str, Any] | Image | Audio] = [] # noqa: UP007 + new_content: list[str | dict[str, Any] | Image | Audio | PDF] = [] # noqa: UP007 for item in content: if isinstance(item, str): - new_content.append(Image.autodetect_safely(item)) + new_content.append(autodetect_media(item)) elif is_image_params(item): new_content.append( ImageWithCacheControl.from_image_params( @@ -790,7 +969,7 @@ def is_image_params(x: Any) -> bool: new_content.append(item) content = new_content elif isinstance(content, str): - content = Image.autodetect_safely(content) + content = autodetect_media(content) elif is_image_params(content): content = ImageWithCacheControl.from_image_params( cast(ImageParams, content) @@ -838,15 +1017,12 @@ def extract_genai_multimodal_content( # Now we need to support a few cases for content_part in content.parts: if content_part.text and autodetect_images: 
- # Detect if the text is an image - converted_item = Image.autodetect_safely(content_part.text) + converted_item = autodetect_media(content_part.text) - # We only do autodetection for images for now - if isinstance(converted_item, Image): + if isinstance(converted_item, (Image, Audio, PDF)): converted_contents.append(converted_item.to_genai()) continue - # If it's not an image or audio, we just return the text converted_contents.append(content_part) else: converted_contents.append(content_part) diff --git a/pyproject.toml b/pyproject.toml index b08967123..849c74efe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ dependencies = [ "diskcache>=5.6.3", ] name = "instructor" -version = "1.11.1" +version = "1.11.2" description = "structured outputs for llm" readme = "README.md" diff --git a/tests/llm/test_gemini/test_multimodal_content.py b/tests/llm/test_gemini/test_multimodal_content.py index 30992fc9a..2a396b3c8 100644 --- a/tests/llm/test_gemini/test_multimodal_content.py +++ b/tests/llm/test_gemini/test_multimodal_content.py @@ -14,7 +14,7 @@ class Description(BaseModel): def test_audio_compatability_list(): client = instructor.from_provider( - model="google/gemini-1.5-flash-latest", mode=instructor.Mode.GEMINI_JSON + model="google/gemini-2.5-flash", mode=instructor.Mode.GEMINI_JSON ) # For now, we'll skip file operations since the new API might handle them differently @@ -35,7 +35,7 @@ def test_audio_compatability_list(): def test_audio_compatability_multiple_messages(): client = instructor.from_provider( - model="google/gemini-1.5-flash-latest", mode=instructor.Mode.GEMINI_JSON + model="google/gemini-2.5-flash", mode=instructor.Mode.GEMINI_JSON ) # For now, we'll skip file operations since the new API might handle them differently diff --git a/tests/test_multimodal.py b/tests/test_multimodal.py index 0c0492b02..b447a32e7 100644 --- a/tests/test_multimodal.py +++ b/tests/test_multimodal.py @@ -1,6 +1,13 @@ import pytest from pathlib import Path -from instructor.processing.multimodal import Image, convert_contents, convert_messages +from instructor.processing.multimodal import ( + PDF, + Audio, + Image, + autodetect_media, + convert_contents, + convert_messages, +) from instructor.mode import Mode from unittest.mock import patch, MagicMock import instructor @@ -374,3 +381,149 @@ def test_raw_base64_autodetect_png(base64_png): image = Image.autodetect(raw_base_64) assert image.media_type == "image/png" assert image.source == image.data == raw_base_64 + + +def test_autodetect_media_data_uris(): + img_uri = ( + "data:image/png;base64," + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII=" + ) + pdf_uri = "data:application/pdf;base64,JVBERi0xLjQK" # "%PDF-1.4\n" + aud_uri = "data:audio/wav;base64,UklGRiQAAABXQVZF" # minimal header-ish + + img = autodetect_media(img_uri) + pdf = autodetect_media(pdf_uri) + aud = autodetect_media(aud_uri) + + assert isinstance(img, Image) + assert img.media_type == "image/png" + + assert isinstance(pdf, PDF) + assert pdf.media_type == "application/pdf" + + assert isinstance(aud, Audio) + assert aud.media_type == "audio/wav" + + +def test_convert_messages_autodetect_media(): + img_uri = ( + "data:image/png;base64," + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgYAAAAAMAASsJTYQAAAAASUVORK5CYII=" + ) + pdf_uri = "data:application/pdf;base64,JVBERi0xLjQK" + + messages = [ + {"role": "user", "content": ["hello", img_uri, pdf_uri]}, + ] + + out = convert_messages(messages, 
mode=Mode.RESPONSES_TOOLS, autodetect_images=True) + assert isinstance(out, list) and len(out) == 1 + + content = out[0]["content"] + assert isinstance(content, list) and len(content) == 3 + + # Text + assert content[0]["type"] in {"input_text", "text"} + assert content[0]["text"] == "hello" + + # Image → input_image with data URI + assert content[1]["type"] == "input_image" + assert isinstance(content[1].get("image_url"), str) + assert content[1]["image_url"].startswith("data:image/png;base64,") + + # PDF → input_file with data URI + assert content[2]["type"] == "input_file" + assert isinstance(content[2].get("file_data"), str) + assert content[2]["file_data"].startswith("data:application/pdf;base64,") + + +def test_pdf_from_url(): + # URL without extension → should HEAD and set media_type; data stays None. + with patch("instructor.processing.multimodal.requests.head") as mock_head: + resp = MagicMock() + resp.headers = {"Content-Type": "application/pdf"} + resp.raise_for_status = MagicMock() + mock_head.return_value = resp + + pdf = PDF.from_url("https://example.com/file") + + assert isinstance(pdf, PDF) + assert pdf.source == "https://example.com/file" + assert pdf.media_type == "application/pdf" + assert pdf.data is None + + +def test_pdf_from_gs_url(): + # gs:// → https://storage.googleapis.com/... (GET) and bytes are base64-encoded. + pdf_bytes = b"%PDF-1.4\n..." + with patch("instructor.processing.multimodal.requests.get") as mock_get: + resp = MagicMock() + resp.headers = {"Content-Type": "application/pdf"} + resp.content = pdf_bytes + resp.raise_for_status = MagicMock() + mock_get.return_value = resp + + pdf = PDF.from_gs_url("gs://bucket/doc.pdf") + + assert isinstance(pdf, PDF) + assert pdf.source == "gs://bucket/doc.pdf" + assert pdf.media_type == "application/pdf" + # Optional strictness without adding global imports: + import base64 as _b64 + + assert pdf.data == _b64.b64encode(pdf_bytes).decode("utf-8") + + +def test_audio_from_url(): + # Audio URL → GET; implementation reads headers.get('content-type') + audio_bytes = b"RIFFxxxxWAVEfmt " + with patch("instructor.processing.multimodal.requests.get") as mock_get: + resp = MagicMock() + resp.headers = {"content-type": "audio/wav"} + resp.content = audio_bytes + resp.raise_for_status = MagicMock() + mock_get.return_value = resp + + audio = Audio.from_url("https://cdn.example.com/a.wav") + + assert isinstance(audio, Audio) + assert audio.source == "https://cdn.example.com/a.wav" + assert audio.media_type == "audio/wav" + import base64 as _b64 + + assert audio.data == _b64.b64encode(audio_bytes).decode("utf-8") + + +def test_audio_from_gs_url(): + # gs:// audio → public GCS GET and base64-encode. + audio_bytes = b"\x00\x01\x02\x03" + with patch("instructor.processing.multimodal.requests.get") as mock_get: + resp = MagicMock() + resp.headers = {"Content-Type": "audio/mpeg"} + resp.content = audio_bytes + resp.raise_for_status = MagicMock() + mock_get.return_value = resp + + audio = Audio.from_gs_url("gs://bkt/path/song.mp3") + + assert isinstance(audio, Audio) + assert audio.source == "gs://bkt/path/song.mp3" + assert audio.media_type == "audio/mpeg" + import base64 as _b64 + + assert audio.data == _b64.b64encode(audio_bytes).decode("utf-8") + + +def test_audio_from_base64(): + # data:audio/* data URI → parsed without network. 
+ import base64 as _b64 + + raw = b"\x11\x22\x33\x44" + uri = "data:audio/wav;base64," + _b64.b64encode(raw).decode("utf-8") + + audio = Audio.from_base64(uri) + + assert isinstance(audio, Audio) + assert audio.source == uri + assert audio.media_type == "audio/wav" + assert audio.data == _b64.b64encode(raw).decode("utf-8") diff --git a/uv.lock b/uv.lock index 97f5d8f06..59f7bf6ce 100644 --- a/uv.lock +++ b/uv.lock @@ -1753,7 +1753,7 @@ wheels = [ [[package]] name = "instructor" -version = "1.11.1" +version = "1.11.2" source = { editable = "." } dependencies = [ { name = "aiohttp" },
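Finally, for consumers of the deprecation shim added in `instructor/exceptions.py` above, a minimal sketch of the migration path (assuming a fresh interpreter, since the module-level `DeprecationWarning` fires only on the first import of the module):

```python
import warnings

# Old import path still resolves, but warns on first import via the shim
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from instructor.exceptions import InstructorRetryException  # noqa: F401

assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Preferred import path going forward
from instructor.core import InstructorRetryException  # noqa: F401
```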