Add extract_metadata to VLM table extractor for LLM stat extraction (#1543)

austin-aryn-ai · web-flow · commit f84f4d3bafa2 · 2025-12-17T15:06:12.000-08:00
* Add extract_metadata to VLM table extractor for LLM stat extraction

* Address comments

* Fix lint

* Fix unit tests

* Fix lint
diff --git a/lib/sycamore/sycamore/llms/chained_llm.py b/lib/sycamore/sycamore/llms/chained_llm.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Optional, Callable
+from typing import Optional, Callable, Any
 
 from sycamore.llms import LLM
 from sycamore.llms.config import LLMModel, ChainedModel
@@ -50,6 +50,13 @@ def chain(self) -> list[LLM]:
         """
         return self._chain
 
+    # TODO: ChainedLLM returns no real metadata yet; this only wraps generate()'s result under "output".
+    def generate_metadata(
+        self, *, prompt: RenderedPrompt, llm_kwargs: Optional[dict] = None, model: Optional[LLMModel] = None
+    ) -> dict[str, Any]:
+        output = self.generate(prompt=prompt, llm_kwargs=llm_kwargs, model=model)
+        return {"output": output}
+
     def generate(
         self, *, prompt: RenderedPrompt, llm_kwargs: Optional[dict] = None, model: Optional[LLMModel] = None
     ) -> str:
diff --git a/lib/sycamore/sycamore/transforms/table_structure/extract.py b/lib/sycamore/sycamore/transforms/table_structure/extract.py
@@ -540,9 +540,17 @@ def __init__(self, llm: LLM, prompt_str: str = EXTRACT_TABLE_STRUCTURE_PROMPT):
         self.prompt_str = prompt_str
 
     def extract(self, element: TableElement, doc_image: Image.Image) -> TableElement:
+        ret: dict = self.extract_metadata(element, doc_image)
+        table_element = ret.get("output")
+        assert isinstance(table_element, TableElement)
+        return table_element
+
+    def extract_metadata(
+        self, element: TableElement, doc_image: Image.Image, llm_kwargs: Optional[dict] = None
+    ) -> dict[str, Any]:
         # We need a bounding box to be able to do anything.
         if element.bbox is None:
-            return element
+            return {"output": element}
 
         cropped_image, _ = _crop_bbox(doc_image, element.bbox)
 
@@ -561,22 +569,24 @@ def response_checker(response: str) -> bool:
             self.llm.response_checker = response_checker
 
         try:
-            res: str = self.llm.generate(prompt=prompt)
+            res_with_md: dict[str, Any] = self.llm.generate_metadata(prompt=prompt, llm_kwargs=llm_kwargs)
 
+            res = res_with_md.pop("output")
             if res.startswith("```html"):
                 res = res[7:].rstrip("`")
             res = res.strip()
 
             table = Table.from_html(res)
             element.table = table
-            return element
+            res_with_md.update({"output": element})
+            return res_with_md
         except Exception as e:
             tb_str = "".join(traceback.format_exception(type(e), e, e.__traceback__))
             logging.warning(
                 f"Failed to extract a table due to:\n{tb_str}\nReturning the original element without a table."
             )
 
-        return element
+        return {"output": element}
 
 
 DEFAULT_TABLE_STRUCTURE_EXTRACTOR = TableTransformerStructureExtractor