Merge pull request #9 from RapidAI/develop

SWHL · web-flow · commit 103602a542e3 · 2024-12-24T08:31:07.000+08:00
fix: fixed issue #3 #7 #8
diff --git a/1.jpg b/1.jpg
diff --git a/README.md b/README.md
@@ -29,16 +29,18 @@
 | `yolov8n_layout_report`|   研报   |   `yolov8n_layout_report.onnx`    | `['Text', 'Title', 'Header', 'Footer', 'Figure', 'Table', 'Toc', 'Figure caption', 'Table caption']` |
 | `yolov8n_layout_publaynet`|   英文   |   `yolov8n_layout_publaynet.onnx`    | `["Text", "Title", "List", "Table", "Figure"]` |
 | `yolov8n_layout_general6`|   通用   |   `yolov8n_layout_general6.onnx`    | `["Text", "Title", "Figure", "Table", "Caption", "Equation"]` |
-| 🔥`doclayout_yolo`|   通用   |   `doclayout_yolo_docstructbench_imgsz1024.onnx`    | `['title', 'text', 'abandon', 'figure', 'figure_caption', 'table', 'table_caption', 'table_footnote', 'isolate_formula', 'formula_caption']` |
+| 🔥`doclayout_docstructbench`|   通用   |   `doclayout_yolo_docstructbench_imgsz1024.onnx`    | `['title', 'plain text', 'abandon', 'figure', 'figure_caption', 'table', 'table_caption', 'table_footnote', 'isolate_formula', 'formula_caption']` |
+| 🔥`doclayout_d4la`|   通用   |   `doclayout_yolo_d4la_imgsz1600_docsynth_pretrain.onnx`    | `['DocTitle', 'ParaTitle', 'ParaText', 'ListText', 'RegionTitle', 'Date', 'LetterHead', 'LetterDear', 'LetterSign', 'Question', 'OtherText', 'RegionKV', 'RegionList', 'Abstract', 'Author', 'TableName', 'Table', 'Figure', 'FigureName', 'Equation', 'Reference', 'Footer', 'PageHeader', 'PageFooter', 'Number', 'Catalog', 'PageNumber']` |
+| 🔥`doclayout_docsynth`|   通用   |   `doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.onnx`    | `['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']` |
 
 PP模型来源：[PaddleOCR 版面分析](https://github.com/PaddlePaddle/PaddleOCR/blob/133d67f27dc8a241d6b2e30a9f047a0fb75bebbe/ppstructure/layout/README_ch.md)
 
 yolov8n系列来源：[360LayoutAnalysis](https://github.com/360AILAB-NLP/360LayoutAnalysis)
 
-doclayout版本暂时有问题，不推荐使用。正在更新中....
-~~（推荐使用）🔥doclayout_yolo模型来源：[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)，该模型是目前最为优秀的开源模型，支持学术论文、Textbook、Financial、Exam Paper、Fuzzy Scans、PPT和Poster 7种文档类型的版面检测。值得一提的是，该模型支持的类别中存在`abandon`一类，主要是文档页面的页眉页脚部分，便于后续快速舍弃。~~
 
-模型下载地址为：[link](https://github.com/RapidAI/RapidLayout/releases/tag/v0.0.0)
+（推荐使用）🔥doclayout_yolo模型来源：[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)，该模型是目前最为优秀的开源版面分析模型。本仓库从中挑选了3个基于不同训练集训练得到的模型：`doclayout_docstructbench`来自[link](https://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/tree/main)，`doclayout_d4la`来自[link](https://huggingface.co/juliozhao/DocLayout-YOLO-D4LA-Docsynth300K_pretrained/blob/main/doclayout_yolo_d4la_imgsz1600_docsynth_pretrain.pt)，`doclayout_docsynth`来自[link](https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/tree/main)。
+
+DocLayout模型下载地址为：[link](https://github.com/RapidAI/RapidLayout/releases/tag/v0.0.0)
 
 ### 安装
 
diff --git a/demo.py b/demo.py
@@ -5,12 +5,13 @@
 
 from rapid_layout import RapidLayout, VisLayout
 
-layout_engine = RapidLayout(model_type="doclayout_yolo", conf_thres=0.1)
+layout_engine = RapidLayout(model_type="doclayout_docsynth")
 
 img_path = "tests/test_files/PMC3576793_00004.jpg"
 img = cv2.imread(img_path)
 
-boxes, scores, class_names, elapse = layout_engine(img)
+boxes, scores, class_names, elapse = layout_engine(img_path)
+print(boxes.shape)
 ploted_img = VisLayout.draw_detections(img, boxes, scores, class_names)
 if ploted_img is not None:
     cv2.imwrite("layout_res.png", ploted_img)
diff --git a/rapid_layout/main.py b/rapid_layout/main.py
@@ -35,7 +35,9 @@
     "yolov8n_layout_report": f"{ROOT_URL}/yolov8n_layout_report.onnx",
     "yolov8n_layout_publaynet": f"{ROOT_URL}/yolov8n_layout_publaynet.onnx",
     "yolov8n_layout_general6": f"{ROOT_URL}/yolov8n_layout_general6.onnx",
-    "doclayout_yolo": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024.onnx",
+    "doclayout_docstructbench": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024.onnx",
+    "doclayout_d4la": f"{ROOT_URL}/doclayout_yolo_d4la_imgsz1600_docsynth_pretrain.onnx",
+    "doclayout_docsynth": f"{ROOT_URL}/doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.onnx",
 }
 DEFAULT_MODEL_PATH = str(ROOT_DIR / "models" / "layout_cdla.onnx")
 
diff --git a/rapid_layout/utils/augment.py b/rapid_layout/utils/augment.py
@@ -0,0 +1,85 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+import cv2
+import numpy as np
+
+
+class LetterBox:
+    """Resize an image to fit `new_shape` and pad the remainder with a constant border (letterboxing)."""
+
+    def __init__(
+        self,
+        new_shape=(640, 640),
+        auto=False,
+        scaleFill=False,
+        scaleup=True,
+        center=True,
+        stride=32,
+    ):
+        """Store the target shape and the flags that control scaling and padding in `__call__`."""
+        self.new_shape = new_shape  # target (height, width); an int is expanded to a square in __call__
+        self.auto = auto  # if True, padding is reduced to its remainder modulo `stride`
+        self.scaleFill = scaleFill  # if True (and `auto` is False), stretch to new_shape with no padding
+        self.scaleup = scaleup  # if False, the scale ratio is capped at 1.0
+        self.stride = stride  # only read when `auto` is True
+        self.center = center  # Put the image in the middle or top-left
+
+    def __call__(self, labels=None, image=None):
+        """Letterbox `image` (or `labels["img"]`); return the image if `labels` ends up empty, else the updated dict."""
+        if labels is None:
+            labels = {}
+        img = labels.get("img") if image is None else image  # an explicit `image` takes precedence
+        shape = img.shape[:2]  # current shape [height, width]
+        new_shape = labels.pop("rect_shape", self.new_shape)  # per-call override; the key is removed from labels
+        if isinstance(new_shape, int):
+            new_shape = (new_shape, new_shape)
+
+        # Scale ratio (new / old)
+        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+        if not self.scaleup:  # only scale down, do not scale up (for better val mAP)
+            r = min(r, 1.0)
+
+        # Compute padding
+        ratio = r, r  # width, height ratios
+        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # (width, height) after scaling, before padding
+        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+        if self.auto:  # minimum rectangle
+            dw, dh = np.mod(dw, self.stride), np.mod(dh, self.stride)  # wh padding
+        elif self.scaleFill:  # stretch
+            dw, dh = 0.0, 0.0
+            new_unpad = (new_shape[1], new_shape[0])
+            ratio = (
+                new_shape[1] / shape[1],
+                new_shape[0] / shape[0],
+            )  # width, height ratios
+
+        if self.center:
+            dw /= 2  # divide padding into 2 sides
+            dh /= 2
+
+        if shape[::-1] != new_unpad:  # resize
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+        top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))  # -/+0.1: a .5 pad rounds down on one side, up on the other
+        left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))  # with center=False all padding goes to right/bottom
+        img = cv2.copyMakeBorder(
+            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
+        )  # add border
+        if labels.get("ratio_pad"):
+            labels["ratio_pad"] = (labels["ratio_pad"], (left, top))  # for evaluation
+
+        if len(labels):  # any remaining key means the caller wants the labels dict back
+            labels = self._update_labels(labels, ratio, dw, dh)
+            labels["img"] = img
+            labels["resized_shape"] = new_shape
+            return labels
+        else:
+            return img
+
+    def _update_labels(self, labels, ratio, padw, padh):
+        """Call convert_bbox("xyxy"), denormalize(w, h of labels["img"]), scale(*ratio), add_padding on labels["instances"]."""
+        labels["instances"].convert_bbox(format="xyxy")
+        labels["instances"].denormalize(*labels["img"].shape[:2][::-1])  # uses the image stored before __call__ overwrites it
+        labels["instances"].scale(*ratio)
+        labels["instances"].add_padding(padw, padh)
+        return labels
diff --git a/rapid_layout/utils/post_prepross.py b/rapid_layout/utils/post_prepross.py
@@ -299,7 +299,7 @@ def extract_boxes(self, predictions):
 
 
 class DocLayoutPostProcess:
-    def __init__(self, labels: List[str], conf_thres=0.7, iou_thres=0.5):
+    def __init__(self, labels: List[str], conf_thres=0.2, iou_thres=0.5):
         self.labels = labels
         self.conf_threshold = conf_thres
         self.iou_threshold = iou_thres
@@ -308,31 +308,18 @@ def __init__(self, labels: List[str], conf_thres=0.7, iou_thres=0.5):
 
     def __call__(
         self,
-        output,
+        preds,
         ori_img_shape: Tuple[int, int],
         img_shape: Tuple[int, int] = (1024, 1024),
     ):
-        self.img_height, self.img_width = ori_img_shape
-        self.input_height, self.input_width = img_shape
-
-        output = output[0].squeeze()
-        boxes = output[:, :-2]
-        confidences = output[:, -2]
-        class_ids = output[:, -1].astype(int)
-
-        mask = confidences > self.conf_threshold
-        boxes = boxes[mask, :]
-        confidences = confidences[mask]
-        class_ids = class_ids[mask]
-
-        # Rescale boxes to original image dimensions
-        boxes = rescale_boxes(
-            boxes,
-            self.input_width,
-            self.input_height,
-            self.img_width,
-            self.img_height,
-        )
+        preds = preds[0]
+        mask = preds[..., 4] > self.conf_threshold
+        preds = [p[mask[idx]] for idx, p in enumerate(preds)][0]
+        preds[:, :4] = scale_boxes(list(img_shape), preds[:, :4], list(ori_img_shape))
+
+        boxes = preds[:, :4]
+        confidences = preds[:, 4]
+        class_ids = preds[:, 5].astype(int)
         labels = [self.labels[i] for i in class_ids]
         return boxes, confidences, labels
 
@@ -345,6 +332,54 @@ def rescale_boxes(boxes, input_width, input_height, img_width, img_height):
     return boxes
 
 
+def scale_boxes(
+    img1_shape, boxes, img0_shape, ratio_pad=None, padding=True, xywh=False
+):
+    """
+    Rescale bounding boxes (xyxy by default) from the image shape they currently refer to (img1_shape) to the
+    shape of another image (img0_shape). `boxes` is modified in place and the same object is returned.
+
+    Args:
+        img1_shape (tuple): Shape of the image the boxes currently refer to, in the format of (height, width).
+        boxes (np.ndarray): Boxes whose last axis starts with (x1, y1, x2, y2); only those four values are changed.
+        img0_shape (tuple): Shape of the target image, in the format of (height, width).
+        ratio_pad (tuple): Optional ((gain, ...), (pad_x, pad_y)). If None, gain and pad are derived from the two
+            shapes, assuming the padding was split evenly between both sides.
+        padding (bool): If True, subtract the pad offsets before dividing by gain (boxes come from a padded,
+            letterbox-style input). If False, only divide by gain.
+        xywh (bool): If True, only the first two coordinates are shifted by the pad, default=False.
+
+    Returns:
+        boxes (np.ndarray): The same array, rescaled and then clipped to img0_shape by `clip_boxes`.
+    """
+    if ratio_pad is None:  # calculate from img0_shape
+        gain = min(
+            img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]
+        )  # gain  = old / new
+        pad = (
+            round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1),
+            round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1),
+        )  # wh padding
+    else:
+        gain = ratio_pad[0][0]
+        pad = ratio_pad[1]
+
+    if padding:
+        boxes[..., 0] -= pad[0]  # x padding
+        boxes[..., 1] -= pad[1]  # y padding
+        if not xywh:
+            boxes[..., 2] -= pad[0]  # x padding
+            boxes[..., 3] -= pad[1]  # y padding
+    boxes[..., :4] /= gain
+    return clip_boxes(boxes, img0_shape)
+
+
+def clip_boxes(boxes, shape):  # clamp xyxy boxes in place to an image of `shape` (height, width); returns the same array
+    boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
+    boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2
+    return boxes
+
+
 def nms(boxes, scores, iou_threshold):
     # Sort by score
     sorted_indices = np.argsort(scores)[::-1]
diff --git a/rapid_layout/utils/pre_procss.py b/rapid_layout/utils/pre_procss.py
@@ -7,11 +7,12 @@
 import cv2
 import numpy as np
 
+from .augment import LetterBox
+
 InputType = Union[str, np.ndarray, bytes, Path]
 
 
 class PPPreProcess:
-
     def __init__(self, img_size: Tuple[int, int]):
         self.size = img_size
         self.mean = np.array([0.485, 0.456, 0.406])
@@ -41,7 +42,6 @@ def permute(self, img: np.ndarray) -> np.ndarray:
 
 
 class YOLOv8PreProcess:
-
     def __init__(self, img_size: Tuple[int, int]):
         self.img_size = img_size
 
@@ -54,14 +54,15 @@ def __call__(self, image: np.ndarray) -> np.ndarray:
 
 
 class DocLayoutPreProcess:
-
     def __init__(self, img_size: Tuple[int, int]):
         self.img_size = img_size
+        self.letterbox = LetterBox(new_shape=img_size, auto=False, stride=32)
 
     def __call__(self, image: np.ndarray) -> np.ndarray:
-        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-        input_img = cv2.resize(image, self.img_size)
-        input_img = input_img / 255.0
-        input_img = input_img.transpose(2, 0, 1)
-        input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
+        input_img = self.letterbox(image=image)
+        input_img = input_img[None, ...]
+        input_img = input_img[..., ::-1].transpose(0, 3, 1, 2)
+        input_img = np.ascontiguousarray(input_img)
+        input_img = input_img / 255
+        input_tensor = input_img.astype(np.float32)
         return input_tensor
diff --git a/tests/test_layout.py b/tests/test_layout.py
@@ -26,10 +26,15 @@
     [
         ("yolov8n_layout_publaynet", 12),
         ("yolov8n_layout_general6", 13),
-        ("doclayout_yolo", 14),
+        (
+            "doclayout_docstructbench",
+            14,
+        ),
+        ("doclayout_d4la", 11),
+        ("doclayout_docsynth", 14),
     ],
 )
-def test_yolov8n_layout(model_type, gt):
+def test_layout(model_type, gt):
     img_path = test_file_dir / "PMC3576793_00004.jpg"
     engine = RapidLayout(model_type=model_type)
     boxes, scores, class_names, *elapse = engine(img_path)