EnhancedDocument API Reference

Module contains logic for indexing documents into vector stores.

EnhancedDocument

Bases: Document

A hashed document with a unique ID.

Source code in src/easy_ingest_text/enhanced_document.py
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
class EnhancedDocument(Document):
    """A document that records its source and content/metadata hashes.

    The ``metadata`` passed at construction must contain a ``"source"``
    entry. A pre-validation hook copies that entry to ``source`` and fills
    in ``content_hash``, ``metadata_hash`` and ``document_hash`` (the
    stringified results of the module's ``_hash_*_to_uuid`` helpers), both
    as model fields and as entries inside ``metadata``.
    """

    source: str
    """The file path of the document."""
    document_hash: str
    """The hash of the document including content and metadata."""
    content_hash: str
    """The hash of the document content."""
    metadata_hash: str
    """The hash of the document metadata."""

    @root_validator(pre=True)
    def calculate_hashes_and_source(cls, values) -> Dict[str, Any]:
        """Populate ``source`` and the hash fields before field validation.

        The hashes are also written into the metadata so that they are
        easy to query on later. The metadata supplied by the caller is
        copied first, so the caller's own dict is left untouched.

        Args:
            values: Raw constructor input; ``page_content`` and
                ``metadata`` are read from it.

        Returns:
            ``values`` with ``source``, ``content_hash``,
            ``metadata_hash``, ``document_hash`` and a (copied, updated)
            ``metadata`` set.

        Raises:
            KeyError: If the metadata is missing or has no ``"source"``.
            ValueError: If only some of the reserved hash keys are present
                in the metadata, or if the metadata cannot be hashed.
        """
        content = values.get("page_content")
        raw_metadata = values.get("metadata")

        # A missing metadata argument arrives here as None (defaults are
        # not applied before a pre-validator); report it as a missing
        # source rather than failing on the membership test.
        if raw_metadata is None or "source" not in raw_metadata:
            raise KeyError(
                "'source' not found in metadata. Each EnhancedDocument must "
                "have a source."
            )

        # Shallow-copy so that adding the hash entries below never mutates
        # a dict owned by the caller (e.g. the metadata of the Document
        # handed to from_document).
        metadata = dict(raw_metadata)
        values["metadata"] = metadata
        values["source"] = metadata["source"]

        forbidden_keys = ("document_hash", "content_hash", "metadata_hash")
        present = [key for key in forbidden_keys if key in metadata]

        # HACK(STP): If we're reloading EnhancedDocuments from their JSON
        # representation, the forbidden keys will already be present. We
        # simply use them here.
        if len(present) == len(forbidden_keys):
            for key in forbidden_keys:
                values[key] = metadata[key]
            return values

        # Only a subset of the reserved keys is present: that is user data
        # colliding with internal bookkeeping, so reject it.
        if present:
            key = present[0]
            raise ValueError(
                f"Metadata cannot contain key {key} as it "
                f"is reserved for internal use."
            )

        content_hash = str(_hash_string_to_uuid(content))

        try:
            metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
        except Exception as e:
            # Keep the original failure attached as the cause.
            raise ValueError(
                f"Failed to hash metadata: {e}. "
                f"Please use a dict that can be serialized using json."
            ) from e

        # The document hash is derived from the two partial hashes.
        document_hash = str(
            _hash_string_to_uuid(content_hash + metadata_hash)
        )

        hashes = {
            "content_hash": content_hash,
            "metadata_hash": metadata_hash,
            "document_hash": document_hash,
        }
        # Expose the hashes both inside the metadata and as model fields.
        metadata.update(hashes)
        values.update(hashes)
        return values

    def to_document(self) -> Document:
        """Return a plain Document with this document's content and metadata.

        The metadata is passed through as-is (it is not copied here), so
        it still contains the hash entries.
        """
        return Document(
            page_content=self.page_content,
            metadata=self.metadata,
        )

    @classmethod
    def from_document(
        cls, document: Document, *, uid: Optional[str] = None
    ) -> EnhancedDocument:
        """Create an EnhancedDocument from a Document.

        Args:
            document: Source of ``page_content`` and ``metadata``; its
                metadata must contain a ``"source"`` entry.
            uid: Forwarded to the constructor unchanged. No ``uid`` field
                is declared on this class here; presumably it is handled
                by the base model (TODO confirm).
        """
        return cls(  # type: ignore[call-arg]
            uid=uid,  # type: ignore[arg-type]
            page_content=document.page_content,
            metadata=document.metadata,
        )

    @classmethod
    def remove_hashes(cls, document: Document) -> Document:
        """Strip the reserved hash entries from a document's metadata.

        The metadata dict is modified in place and the same ``document``
        object is returned. Keys that are absent are ignored.
        """
        forbidden_keys = ("document_hash", "content_hash", "metadata_hash")
        metadata = document.metadata
        for key in forbidden_keys:
            metadata.pop(key, None)
        return document

content_hash: str instance-attribute

The hash of the document content.

document_hash: str instance-attribute

The hash of the document including content and metadata.

metadata_hash: str instance-attribute

The hash of the document metadata.

source: str instance-attribute

The file path of the document.

calculate_hashes_and_source(values)

Calculate content, metadata and overall document hash.

The hashes are also stored in the metadata, which makes it easier to query on them if required.

Source code in src/easy_ingest_text/enhanced_document.py
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
@root_validator(pre=True)
def calculate_hashes_and_source(cls, values) -> Dict[str, Any]:
    """Populate ``source`` and the hash fields before field validation.

    The hashes are also written into the metadata so that they are easy
    to query on later. The metadata supplied by the caller is copied
    first, so the caller's own dict is left untouched.

    Args:
        values: Raw constructor input; ``page_content`` and ``metadata``
            are read from it.

    Returns:
        ``values`` with ``source``, ``content_hash``, ``metadata_hash``,
        ``document_hash`` and a (copied, updated) ``metadata`` set.

    Raises:
        KeyError: If the metadata is missing or has no ``"source"``.
        ValueError: If only some of the reserved hash keys are present in
            the metadata, or if the metadata cannot be hashed.
    """
    content = values.get("page_content")
    raw_metadata = values.get("metadata")

    # A missing metadata argument arrives here as None (defaults are not
    # applied before a pre-validator); report it as a missing source
    # rather than failing on the membership test.
    if raw_metadata is None or "source" not in raw_metadata:
        raise KeyError(
            "'source' not found in metadata. Each EnhancedDocument must "
            "have a source."
        )

    # Shallow-copy so that adding the hash entries below never mutates a
    # dict owned by the caller.
    metadata = dict(raw_metadata)
    values["metadata"] = metadata
    values["source"] = metadata["source"]

    forbidden_keys = ("document_hash", "content_hash", "metadata_hash")
    present = [key for key in forbidden_keys if key in metadata]

    # HACK(STP): If we're reloading EnhancedDocuments from their JSON
    # representation, the forbidden keys will already be present. We
    # simply use them here.
    if len(present) == len(forbidden_keys):
        for key in forbidden_keys:
            values[key] = metadata[key]
        return values

    # Only a subset of the reserved keys is present: that is user data
    # colliding with internal bookkeeping, so reject it.
    if present:
        key = present[0]
        raise ValueError(
            f"Metadata cannot contain key {key} as it "
            f"is reserved for internal use."
        )

    content_hash = str(_hash_string_to_uuid(content))

    try:
        metadata_hash = str(_hash_nested_dict_to_uuid(metadata))
    except Exception as e:
        # Keep the original failure attached as the cause.
        raise ValueError(
            f"Failed to hash metadata: {e}. "
            f"Please use a dict that can be serialized using json."
        ) from e

    # The document hash is derived from the two partial hashes.
    document_hash = str(
        _hash_string_to_uuid(content_hash + metadata_hash)
    )

    hashes = {
        "content_hash": content_hash,
        "metadata_hash": metadata_hash,
        "document_hash": document_hash,
    }
    # Expose the hashes both inside the metadata and as model fields.
    metadata.update(hashes)
    values.update(hashes)
    return values

from_document(document, *, uid=None) classmethod

Create an EnhancedDocument from a Document.

Source code in src/easy_ingest_text/enhanced_document.py
115
116
117
118
119
120
121
122
123
124
@classmethod
def from_document(
    cls, document: Document, *, uid: Optional[str] = None
) -> EnhancedDocument:
    """Build an instance of this class from an existing Document.

    The document's ``page_content`` and ``metadata`` are handed to the
    constructor together with ``uid``, which is forwarded unchanged.
    """
    init_kwargs = {
        "uid": uid,
        "page_content": document.page_content,
        "metadata": document.metadata,
    }
    return cls(**init_kwargs)  # type: ignore[arg-type]

to_document()

Return a Document object.

Source code in src/easy_ingest_text/enhanced_document.py
108
109
110
111
112
113
def to_document(self) -> Document:
    """Convert this instance into a plain Document.

    The new Document receives this object's ``page_content`` and its
    ``metadata`` (passed through as-is, not copied here).
    """
    text, meta = self.page_content, self.metadata
    return Document(page_content=text, metadata=meta)