Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Use NanoIDs in file_scan.
  • Loading branch information
alexaryn committed Nov 22, 2024
commit b5e4ee8f0b06dbb1eb64f3cd7e56c1f08ad261d6
12 changes: 6 additions & 6 deletions lib/sycamore/sycamore/connectors/file/file_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
import boto3
import mimetypes
from typing import Any, Optional, Union, Tuple, Callable, TYPE_CHECKING
import uuid
import logging

from pyarrow._fs import FileInfo
from pyarrow.fs import FileSystem, FileSelector
from sycamore.data import Document
from sycamore.plan_nodes import Scan
from sycamore.data.document import mkdocid
from sycamore.utils.time_trace import timetrace

if TYPE_CHECKING:
Expand All @@ -21,7 +21,7 @@


def _set_id(doc: dict[str, Any]) -> dict[str, Any]:
doc["doc_id"] = str(uuid.uuid1())
doc["doc_id"] = mkdocid()
return doc


Expand Down Expand Up @@ -156,7 +156,7 @@ class BinaryScan(FileScan):
"""Scan data file into raw bytes

For each file, BinaryScan creates one Document in the form of
{"doc_id": uuid,
{"doc_id": nanoid,
"content": {"binary": xxx, "text": None},
"properties": {"path": xxx}, "filetype": yyy}.

Expand Down Expand Up @@ -192,7 +192,7 @@ def __init__(
def _to_document(self, dict: dict[str, Any]) -> dict[str, bytes]:
document = Document()

document.doc_id = str(uuid.uuid1())
document.doc_id = mkdocid("f")
document.type = self._binary_format
document.binary_representation = dict["bytes"]

Expand Down Expand Up @@ -242,7 +242,7 @@ def process_file(self, info) -> list[Document]:
binary_data = file.read()

document = Document()
document.doc_id = str(uuid.uuid1())
document.doc_id = mkdocid("f")
document.type = self._binary_format
document.binary_representation = binary_data
document.properties["path"] = info.path
Expand Down Expand Up @@ -287,7 +287,7 @@ def __init__(
def _to_document(self, json_dict: dict[str, Any]) -> list[dict[str, Any]]:
document = Document()

document.doc_id = str(uuid.uuid1())
document.doc_id = mkdocid()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one doesn't get a scheme?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Defaults to "d".

document.type = "json"

if self._document_body_field is not None:
Expand Down
Loading