Spaces:

AlhitawiMohammed22
/

HTD_HTR

Runtime error

App Files Files Community

AlhitawiMohammed22 committed on Sep 9, 2023

Commit

ff135d3

1 Parent(s): 9002e70

Create Builder Script

Browse files

Files changed (1) hide show

builder.py +305 -0

builder.py ADDED Viewed

	@@ -0,0 +1,305 @@

+# Copyright (C) 2021, Mindee.
+# This program is licensed under the Apache License version 2.
+# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+from typing import Any, Dict, List, Tuple
+import pandas as pd
+import numpy as np
+from scipy.cluster.hierarchy import fclusterdata
+from doctr.utils.geometry import estimate_page_angle, resolve_enclosing_bbox, resolve_enclosing_rbbox, rotate_boxes
+from doctr.utils.repr import NestedObject
+__all__ = ['DocumentBuilder']
+class DocumentBuilder(NestedObject):
+    """Implements a document builder
+    Args:
+        resolve_lines: whether words should be automatically grouped into lines
+        resolve_blocks: whether lines should be automatically grouped into blocks
+        paragraph_break: relative length of the minimum space separating paragraphs
+        export_as_straight_boxes: if True, force straight boxes in the export (fit a rectangle
+            box to all rotated boxes). Else, keep the boxes format unchanged, no matter what it is.
+    """
+    def __init__(
+        self,
+        resolve_lines: bool = True,
+        resolve_blocks: bool = True,
+        paragraph_break: float = 0.035,
+        export_as_straight_boxes: bool = False,
+    ) -> None:
+        self.resolve_lines = resolve_lines
+        self.resolve_blocks = resolve_blocks
+        self.paragraph_break = paragraph_break
+        self.export_as_straight_boxes = export_as_straight_boxes
+    @staticmethod
+    def _sort_boxes(boxes: np.ndarray) -> np.ndarray:
+        """Sort bounding boxes from top to bottom, left to right
+        Args:
+            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) (in case of rotated bbox)
+        Returns:
+            tuple: indices of ordered boxes of shape (N,), boxes
+                If straight boxes are passed tpo the function, boxes are unchanged
+                else: boxes returned are straight boxes fitted to the straightened rotated boxes
+                so that we fit the lines afterwards to the straigthened page
+        """
+        if boxes.ndim == 3:
+            boxes = rotate_boxes(
+                loc_preds=boxes,
+                angle=-estimate_page_angle(boxes),
+                orig_shape=(1024, 1024),
+                min_angle=5.,
+            )
+            boxes = np.concatenate((boxes.min(1), boxes.max(1)), -1)
+        return (boxes[:, 0] + 2 * boxes[:, 3] / np.median(boxes[:, 3] - boxes[:, 1])).argsort(), boxes
+    def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: List[int]) -> List[List[int]]:
+        """Split a line in sub_lines
+        Args:
+            boxes: bounding boxes of shape (N, 4)
+            word_idcs: list of indexes for the words of the line
+        Returns:
+            A list of (sub-)lines computed from the original line (words)
+        """
+        lines = []
+        # Sort words horizontally
+        word_idcs = [word_idcs[idx]
+                     for idx in boxes[word_idcs, 0].argsort().tolist()]
+        # Eventually split line horizontally
+        if len(word_idcs) < 2:
+            lines.append(word_idcs)
+        else:
+            sub_line = [word_idcs[0]]
+            for i in word_idcs[1:]:
+                horiz_break = True
+                prev_box = boxes[sub_line[-1]]
+                # Compute distance between boxes
+                dist = boxes[i, 0] - prev_box[2]
+                # If distance between boxes is lower than paragraph break, same sub-line
+                if dist < self.paragraph_break:
+                    horiz_break = False
+                if horiz_break:
+                    lines.append(sub_line)
+                    sub_line = []
+                sub_line.append(i)
+            lines.append(sub_line)
+        return lines
+    def _resolve_lines(self, boxes: np.ndarray) -> List[List[int]]:
+        """Order boxes to group them in lines
+        Args:
+            boxes: bounding boxes of shape (N, 4) or (N, 4, 2) in case of rotated bbox
+        Returns:
+            nested list of box indices
+        """
+        # Sort boxes, and straighten the boxes if they are rotated
+        idxs, boxes = self._sort_boxes(boxes)
+        # Compute median for boxes heights
+        y_med = np.median(boxes[:, 3] - boxes[:, 1])
+        lines = []
+        words = [idxs[0]]  # Assign the top-left word to the first line
+        # Define a mean y-center for the line
+        y_center_sum = boxes[idxs[0]][[1, 3]].mean()
+        for idx in idxs[1:]:
+            vert_break = True
+            # Compute y_dist
+            y_dist = abs(boxes[idx][[1, 3]].mean() - y_center_sum / len(words))
+            # If y-center of the box is close enough to mean y-center of the line, same line
+            if y_dist < y_med / 2:
+                vert_break = False
+            if vert_break:
+                # Compute sub-lines (horizontal split)
+                lines.extend(self._resolve_sub_lines(boxes, words))
+                words = []
+                y_center_sum = 0
+            words.append(idx)
+            y_center_sum += boxes[idx][[1, 3]].mean()
+        # Use the remaining words to form the last(s) line(s)
+        if len(words) > 0:
+            # Compute sub-lines (horizontal split)
+            lines.extend(self._resolve_sub_lines(boxes, words))
+        return lines
+    @staticmethod
+    def _resolve_blocks(boxes: np.ndarray, lines: List[List[int]]) -> List[List[List[int]]]:
+        """Order lines to group them in blocks
+        Args:
+            boxes: bounding boxes of shape (N, 4) or (N, 4, 2)
+            lines: list of lines, each line is a list of idx
+        Returns:
+            nested list of box indices
+        """
+        # Resolve enclosing boxes of lines
+        if boxes.ndim == 3:
+            box_lines = np.asarray([
+                resolve_enclosing_rbbox(
+                    [tuple(boxes[idx, :, :]) for idx in line])
+                for line in lines  # type: ignore[misc]
+            ])
+        else:
+            _box_lines = [
+                resolve_enclosing_bbox([
+                    # type: ignore[misc]
+                    (tuple(boxes[idx, :2]), tuple(boxes[idx, 2:])) for idx in line
+                ])
+                for line in lines
+            ]
+            box_lines = np.asarray([(x1, y1, x2, y2)
+                                   for ((x1, y1), (x2, y2)) in _box_lines])
+        # Compute geometrical features of lines to clusterize
+        # Clusterizing only with box centers yield to poor results for complex documents
+        if boxes.ndim == 3:
+            box_features = np.stack(
+                (
+                    (box_lines[:, 0, 0] + box_lines[:, 0, 1]) / 2,
+                    (box_lines[:, 0, 0] + box_lines[:, 2, 0]) / 2,
+                    (box_lines[:, 0, 0] + box_lines[:, 2, 1]) / 2,
+                    (box_lines[:, 0, 1] + box_lines[:, 2, 1]) / 2,
+                    (box_lines[:, 0, 1] + box_lines[:, 2, 0]) / 2,
+                    (box_lines[:, 2, 0] + box_lines[:, 2, 1]) / 2,
+                ), axis=-1
+            )
+        else:
+            box_features = np.stack(
+                (
+                    (box_lines[:, 0] + box_lines[:, 3]) / 2,
+                    (box_lines[:, 1] + box_lines[:, 2]) / 2,
+                    (box_lines[:, 0] + box_lines[:, 2]) / 2,
+                    (box_lines[:, 1] + box_lines[:, 3]) / 2,
+                    box_lines[:, 0],
+                    box_lines[:, 1],
+                ), axis=-1
+            )
+        # Compute clusters
+        clusters = fclusterdata(
+            box_features, t=0.1, depth=4, criterion='distance', metric='euclidean')
+        _blocks: Dict[int, List[int]] = {}
+        # Form clusters
+        for line_idx, cluster_idx in enumerate(clusters):
+            if cluster_idx in _blocks.keys():
+                _blocks[cluster_idx].append(line_idx)
+            else:
+                _blocks[cluster_idx] = [line_idx]
+        # Retrieve word-box level to return a fully nested structure
+        blocks = [[lines[idx] for idx in block] for block in _blocks.values()]
+        return blocks
+    def _build_blocks(self, boxes: np.ndarray, word_preds: List[Tuple[str, float]], page_shapes: List[Tuple[int, int]]) -> Any:
+        """Gather independent words in structured blocks
+        Args:
+            boxes: bounding boxes of all detected words of the page, of shape (N, 5) or (N, 4, 2)
+            word_preds: list of all detected words of the page, of shape N
+        Returns:
+            list of block elements
+        """
+        if boxes.shape[0] != len(word_preds):
+            raise ValueError(
+                f"Incompatible argument lengths: {boxes.shape[0]}, {len(word_preds)}")
+        if boxes.shape[0] == 0:
+            return []
+        # Decide whether we try to form lines
+        _boxes = boxes
+        if self.resolve_lines:
+            lines = self._resolve_lines(
+                _boxes if _boxes.ndim == 3 else _boxes[:, :4])
+            # Decide whether we try to form blocks
+            if self.resolve_blocks and len(lines) > 1:
+                _blocks = self._resolve_blocks(
+                    _boxes if _boxes.ndim == 3 else _boxes[:, :4], lines)
+            else:
+                _blocks = [lines]
+        else:
+            # Sort bounding boxes, one line for all boxes, one block for the line
+            lines = [self._sort_boxes(
+                _boxes if _boxes.ndim == 3 else _boxes[:, :4])[0]]
+            _blocks = [lines]
+        rows = []
+        for block_idx, lines in enumerate(_blocks):
+            for line_idx, line in enumerate(lines):
+                for i,idx in enumerate(line):
+                    h, w = page_shapes
+                    row = (
+                        block_idx, line_idx, i, word_preds[idx],
+                        int(round(boxes[idx, 0]*w)
+                            ), int(round(boxes[idx, 1]*h)),
+                        int(round(boxes[idx, 2]*w)
+                            ), int(round(boxes[idx, 3]*h)),
+                        int(round(boxes[idx, 4]*100))
+                    )
+                    rows.append(row)
+        return rows
+    def extra_repr(self) -> str:
+        return (f"resolve_lines={self.resolve_lines}, resolve_blocks={self.resolve_blocks}, "
+                f"paragraph_break={self.paragraph_break}, "
+                f"export_as_straight_boxes={self.export_as_straight_boxes}")
+    def __call__(
+        self,
+        boxes: List[np.ndarray],
+        text_preds: List[List[Tuple[str, float]]],
+        page_shapes: List[Tuple[int, int]]
+    ) -> pd.DataFrame:
+        """Re-arrange detected words into structured blocks
+        Args:
+            boxes: list of N elements, where each element represents the localization predictions, of shape (*, 5)
+                or (*, 6) for all words for a given page
+            text_preds: list of N elements, where each element is the list of all word prediction (text + confidence)
+            page_shape: shape of each page, of size N
+        Returns:
+            document object
+        """
+        if len(boxes) != len(text_preds) or len(boxes) != len(page_shapes):
+            raise ValueError(
+                "All arguments are expected to be lists of the same size")
+        if self.export_as_straight_boxes and len(boxes) > 0:
+            # If boxes are already straight OK, else fit a bounding rect
+            if boxes[0].ndim == 3:
+                straight_boxes = []
+                # Iterate over pages
+                for p_boxes in boxes:
+                    # Iterate over boxes of the pages
+                    straight_boxes.append(np.concatenate(
+                        (p_boxes.min(1), p_boxes.max(1)), 1))
+                boxes = straight_boxes
+        _pages = [
+            pd.DataFrame.from_records(self._build_blocks(page_boxes, word_preds, shape), columns=[
+                "block_num", "line_num", "word_num" ,"word", "xmin", "ymin", "xmax", "ymax", "confidence_score"
+            ])
+            for _idx, shape, page_boxes, word_preds in zip(range(len(boxes)), page_shapes, boxes, text_preds)
+        ]
+        return _pages