EnhancedDocument API Reference¶

Module contains logic for indexing documents into vector stores.

`EnhancedDocument` ¶

Bases: Document

A hashed document with a unique ID.

Source code in src/easy_ingest_text/enhanced_document.py

class EnhancedDocument(Document):
    """A hashed document with a unique ID."""

    source: str
    """The file path of the document."""
    document_hash: str
    """The hash of the document including content and metadata."""
    content_hash: str
    """The hash of the document content."""
    metadata_hash: str
    """The hash of the document metadata."""

    @root_validator(pre=True)
    def calculate_hashes_and_source(cls, values) -> Dict[str, Any]:
        """Calculate content, metadata and overall document hash.

        Also, update the metadata to include these hashes in there, in order
        to make it easier to query on them if required.
        """
        content = values.get("page_content")
        metadata = values.get("metadata")

        if "source" not in metadata:
            raise KeyError(
                "'source' not found in metadata. Each EnhancedDocument must "
                "have a source."
            )

        values["source"] = metadata["source"]

        forbidden_keys = ("document_hash", "content_hash", "metadata_hash")

        # HACK(STP): If we're reloading EnhancedDocuments from their JSON
        # representation, the forbidden keys will already be present. We
        # simply use them here.
        if all(key in metadata for key in forbidden_keys):
            for key in forbidden_keys:
                values[key] = metadata[key]

        else:
            for key in forbidden_keys:
                if key in metadata:
                    raise ValueError(
                        f"Metadata cannot contain key {key} as it "
                        f"is reserved for internal use."
                    )

            content_hash = str(_hash_string_to_uuid(content))

            try:
                metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
            except Exception as e:
                raise ValueError(
                    f"Failed to hash metadata: {e}. "
                    f"Please use a dict that can be serialized using json."
                )

            document_hash = str(
                _hash_string_to_uuid(content_hash + metadata_hash)
            )

            # Update metadata with hashes
            hashes = {}
            hashes["content_hash"] = content_hash
            hashes["metadata_hash"] = metadata_hash
            hashes["document_hash"] = document_hash
            metadata.update(hashes)

            # Set hash values in the model
            # Ensure values are explicitly set
            values["content_hash"] = content_hash
            values["metadata_hash"] = metadata_hash
            values["document_hash"] = document_hash
        return values

    def to_document(self) -> Document:
        """Return a Document object."""
        return Document(
            page_content=self.page_content,
            metadata=self.metadata,
        )

    @classmethod
    def from_document(
        cls, document: Document, *, uid: Optional[str] = None
    ) -> EnhancedDocument:
        """Create a HashedDocument from a Document."""
        return cls(  # type: ignore[call-arg]
            uid=uid,  # type: ignore[arg-type]
            page_content=document.page_content,
            metadata=document.metadata,
        )

    @classmethod
    def remove_hashes(cls, document: Document) -> Document:
        forbidden_keys = ("document_hash", "content_hash", "metadata_hash")
        metadata = document.metadata
        for key in forbidden_keys:
            if key in metadata:
                del metadata[key]
        return document

`content_hash: str` `instance-attribute` ¶

The hash of the document content.

`document_hash: str` `instance-attribute` ¶

The hash of the document including content and metadata.

`metadata_hash: str` `instance-attribute` ¶

The hash of the document metadata.

`source: str` `instance-attribute` ¶

The file path of the document.

`calculate_hashes_and_source(values)` ¶

Calculate content, metadata and overall document hash.

Also, update the metadata to include these hashes in there, in order to make it easier to query on them if required.

Source code in src/easy_ingest_text/enhanced_document.py

@root_validator(pre=True)
def calculate_hashes_and_source(cls, values) -> Dict[str, Any]:
    """Calculate content, metadata and overall document hash.

    Also, update the metadata to include these hashes in there, in order
    to make it easier to query on them if required.
    """
    content = values.get("page_content")
    metadata = values.get("metadata")

    if "source" not in metadata:
        raise KeyError(
            "'source' not found in metadata. Each EnhancedDocument must "
            "have a source."
        )

    values["source"] = metadata["source"]

    forbidden_keys = ("document_hash", "content_hash", "metadata_hash")

    # HACK(STP): If we're reloading EnhancedDocuments from their JSON
    # representation, the forbidden keys will already be present. We
    # simply use them here.
    if all(key in metadata for key in forbidden_keys):
        for key in forbidden_keys:
            values[key] = metadata[key]

    else:
        for key in forbidden_keys:
            if key in metadata:
                raise ValueError(
                    f"Metadata cannot contain key {key} as it "
                    f"is reserved for internal use."
                )

        content_hash = str(_hash_string_to_uuid(content))

        try:
            metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
        except Exception as e:
            raise ValueError(
                f"Failed to hash metadata: {e}. "
                f"Please use a dict that can be serialized using json."
            )

        document_hash = str(
            _hash_string_to_uuid(content_hash + metadata_hash)
        )

        # Update metadata with hashes
        hashes = {}
        hashes["content_hash"] = content_hash
        hashes["metadata_hash"] = metadata_hash
        hashes["document_hash"] = document_hash
        metadata.update(hashes)

        # Set hash values in the model
        # Ensure values are explicitly set
        values["content_hash"] = content_hash
        values["metadata_hash"] = metadata_hash
        values["document_hash"] = document_hash
    return values

`from_document(document, *, uid=None)` `classmethod` ¶

Create a HashedDocument from a Document.

Source code in src/easy_ingest_text/enhanced_document.py

@classmethod
def from_document(
    cls, document: Document, *, uid: Optional[str] = None
) -> EnhancedDocument:
    """Create a HashedDocument from a Document."""
    return cls(  # type: ignore[call-arg]
        uid=uid,  # type: ignore[arg-type]
        page_content=document.page_content,
        metadata=document.metadata,
    )

`to_document()` ¶

Return a Document object.

Source code in src/easy_ingest_text/enhanced_document.py

def to_document(self) -> Document:
    """Return a Document object."""
    return Document(
        page_content=self.page_content,
        metadata=self.metadata,
    )

EnhancedDocument API Reference¶