Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Ready] Add image_segment_mapper #394

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,10 @@ process:
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- image_segment_mapper: # perform segment-anything on images and return the bounding boxes.
imgsz: 1024 # image resolution after image resizing
conf: 0.05 # confidence score threshold
iou: 0.5 # IoU (Intersection over Union) score threshold
- image_tagging_mapper: # Mapper to generate image tags.
tag_field_name: '__dj__image_tags__' # the field name to store the tags. It's "__dj__image_tags__" by default.
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
Expand Down
33 changes: 17 additions & 16 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .image_captioning_mapper import ImageCaptioningMapper
from .image_diffusion_mapper import ImageDiffusionMapper
from .image_face_blur_mapper import ImageFaceBlurMapper
from .image_segment_mapper import ImageSegmentMapper
from .image_tagging_mapper import ImageTaggingMapper
from .nlpaug_en_mapper import NlpaugEnMapper
from .nlpcda_zh_mapper import NlpcdaZhMapper
Expand Down Expand Up @@ -72,20 +73,20 @@
'GenerateQAFromExamplesMapper', 'GenerateQAFromTextMapper',
'ImageBlurMapper', 'ImageCaptioningFromGPT4VMapper',
'ImageCaptioningMapper', 'ImageDiffusionMapper', 'ImageFaceBlurMapper',
'ImageTaggingMapper', 'NlpaugEnMapper', 'NlpcdaZhMapper',
'OptimizeQAMapper', 'OptimizeQueryMapper', 'OptimizeResponseMapper',
'PairPreferenceMapper', 'PunctuationNormalizationMapper',
'RemoveBibliographyMapper', 'RemoveCommentsMapper', 'RemoveHeaderMapper',
'RemoveLongWordsMapper', 'RemoveNonChineseCharacterlMapper',
'RemoveRepeatSentencesMapper', 'RemoveSpecificCharsMapper',
'RemoveTableTextMapper', 'RemoveWordsWithIncorrectSubstringsMapper',
'ReplaceContentMapper', 'SentenceSplitMapper', 'TextChunkMapper',
'VideoCaptioningFromAudioMapper', 'VideoCaptioningFromFramesMapper',
'VideoCaptioningFromSummarizerMapper', 'VideoCaptioningFromVideoMapper',
'VideoFFmpegWrappedMapper', 'VideoFaceBlurMapper',
'VideoRemoveWatermarkMapper', 'VideoResizeAspectRatioMapper',
'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper',
'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper',
'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper',
'WhitespaceNormalizationMapper'
'ImageSegmentMapper', 'ImageTaggingMapper', 'NlpaugEnMapper',
'NlpcdaZhMapper', 'OptimizeQAMapper', 'OptimizeQueryMapper',
'OptimizeResponseMapper', 'PairPreferenceMapper',
'PunctuationNormalizationMapper', 'RemoveBibliographyMapper',
'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper',
'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
'SentenceSplitMapper', 'TextChunkMapper', 'VideoCaptioningFromAudioMapper',
'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',
'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper',
'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper',
'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper',
'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper'
]
69 changes: 69 additions & 0 deletions data_juicer/ops/mapper/image_segment_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import numpy as np

from data_juicer.utils.constant import Fields
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, UNFORKABLE, Mapper
from ..op_fusion import LOADED_IMAGES

# Name under which this operator is registered (passed to every
# ``register_module`` decorator on the class below).
OP_NAME = 'image_segment_mapper'

# Third-party modules wrapped in LazyLoader; presumably the real import is
# deferred until first attribute access -- confirm against LazyLoader.
torch = LazyLoader('torch', 'torch')
ultralytics = LazyLoader('ultralytics', 'ultralytics')


@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
@LOADED_IMAGES.register_module(OP_NAME)
class ImageSegmentMapper(Mapper):
    """Mapper that runs a FastSAM model on every image of a sample and
    stores the resulting bounding boxes under ``Fields.bbox_tag``."""

    _accelerator = 'cuda'

    def __init__(self, imgsz=1024, conf=0.05, iou=0.5, *args, **kwargs):
        """
        Initialization method.

        :param imgsz: resolution for image resizing
        :param conf: confidence score threshold
        :param iou: IoU (Intersection over Union) score threshold
        :param args: extra positional args forwarded to the base class
        :param kwargs: extra keyword args forwarded to the base class
        """
        super().__init__(*args, **kwargs)

        # inference settings handed to the model on every call
        self.imgsz, self.conf, self.iou = imgsz, conf, iou

        # register the FastSAM checkpoint; the model itself is fetched
        # later through get_model() inside process_single()
        self.model_key = prepare_model(model_type='fastsam',
                                       model_path='FastSAM-x.pt')

    @staticmethod
    def _empty_boxes():
        """Return the placeholder stored when a sample yields no boxes."""
        # N x M x 4 for N images, M boxes, 4 coords
        return np.empty((0, 0, 4), dtype=np.float32)

    def process_single(self, sample, rank=None, context=False):
        """
        Segment the images of one sample and record their bounding boxes.

        :param sample: the sample to process; its images are read from
            ``sample[self.image_key]``
        :param rank: forwarded to ``get_model``
        :param context: forwarded to ``load_data_with_context``
        :return: the same sample with ``Fields.bbox_tag`` filled in: a list
            holding one ``boxes.xywh`` array per processed image, or an
            empty ``(0, 0, 4)`` float32 array when there is nothing to
            report
        """
        image_refs = self.image_key in sample and sample[self.image_key]
        if not image_refs:
            # there is no image in this sample
            sample[Fields.bbox_tag] = self._empty_boxes()
            return sample

        sample, images = load_data_with_context(sample, context, image_refs,
                                                load_image)
        model = get_model(self.model_key, rank=rank, use_cuda=self.use_cuda())

        # the list is attached to the sample first and then filled in place
        per_image_boxes = sample[Fields.bbox_tag] = []

        # NOTE(review): `images` is iterated directly. If
        # load_data_with_context returns a mapping keyed by image path, this
        # loop hands the keys (paths) to the model rather than the loaded
        # images -- confirm against that helper.
        for image in images:
            result = model(image,
                           retina_masks=True,
                           imgsz=self.imgsz,
                           conf=self.conf,
                           iou=self.iou,
                           verbose=False)[0]
            per_image_boxes.append(result.boxes.xywh.cpu().numpy())

        # match schema
        if not per_image_boxes:
            sample[Fields.bbox_tag] = self._empty_boxes()
        return sample
4 changes: 2 additions & 2 deletions data_juicer/ops/mapper/image_tagging_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ def __init__(self,
"""
super().__init__(*args, **kwargs)
self.model_key = prepare_model(
model_type='recognizeAnything',
pretrained_model_name_or_path='ram_plus_swin_large_14m.pth',
model_type='ram',
model_path='ram_plus_swin_large_14m.pth',
input_size=384)
self.transform = ram.get_transform(image_size=384)
self.tag_field_name = tag_field_name
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/mapper/video_tagging_from_frames_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ def __init__(self,
f'Frame sampling method [{frame_sampling_method}] is not '
f'supported. Can only be one of ["all_keyframes", "uniform"].')
self.model_key = prepare_model(
model_type='recognizeAnything',
pretrained_model_name_or_path='ram_plus_swin_large_14m.pth',
model_type='ram',
model_path='ram_plus_swin_large_14m.pth',
input_size=384)
self.frame_sampling_method = frame_sampling_method
self.frame_num = frame_num
Expand Down
145 changes: 74 additions & 71 deletions data_juicer/utils/auto_install_mapping.py
Original file line number Diff line number Diff line change
@@ -1,108 +1,111 @@
# Map each imported module to the required packages that need to be installed
# keep sorted for maintainability
MODULE_TO_PKGS = {
'PIL': ['Pillow'],
'aesthetics_predictor': ['simple-aesthetics-predictor'],
'cv2': ['opencv-python'],
'fasttext': ['fasttext-wheel'],
'ffmpeg': ['ffmpeg-python'],
'PIL': ['Pillow'],
'ram': ['ram@git+https://github.com/xinyu1205/recognize-anything.git'],
'scenedetect': ['scenedetect[opencv]'],
'simhash': ['simhash-pybind'],
'simhash': ['simhash-pybind']
}

# Packages to corresponding ops that require them
# keep sorted for maintainability
PKG_TO_OPS = {
'torch': [
'image_aesthetics_filter', 'image_nsfw_filter',
'image_text_matching_filter', 'image_text_similarity_filter',
'image_watermark_filter', 'phrase_grounding_recall_filter',
'video_aesthetics_filter', 'video_frames_text_similarity_filter',
'video_nsfw_filter', 'video_tagging_from_frames_filter',
'video_watermark_filter', 'generate_qa_from_text_mapper',
'generate_qa_from_examples_mapper', 'image_captioning_mapper',
'image_diffusion_mapper', 'image_tagging_mapper',
'optimize_query_mapper', 'optimize_response_mapper',
'optimize_qa_mapper', 'video_captioning_from_frames_mapper',
'video_captioning_from_summarizer_mapper',
'video_captioning_from_video_mapper',
'video_tagging_from_audio_mapper', 'video_tagging_from_frames_mapper'
],
'torchaudio': [
'video_captioning_from_summarizer_mapper',
'video_tagging_from_audio_mapper'
'accelerate': [
'video_captioning_from_audio_mapper',
'video_captioning_from_summarizer_mapper'
],
'diffusers': ['image_diffusion_mapper'],
'easyocr': ['video_ocr_area_ratio_filter'],
'einops': [
'video_captioning_from_audio_mapper',
'video_captioning_from_summarizer_mapper'
],
'fasttext-wheel': ['language_id_score_filter'],
'ffmpeg-python': [
'audio_ffmpeg_wrapped_mapper', 'video_ffmpeg_wrapped_mapper',
'video_resize_aspect_ratio_mapper', 'video_resize_resolution_mapper'
],
'ftfy': ['fix_unicode_mapper'],
'imagededup': ['image_deduplicator', 'ray_image_deduplicator'],
'kenlm': ['perplexity_filter'],
'nlpaug': ['nlpaug_en_mapper'],
'nlpcda': ['nlpcda'],
'nltk': ['phrase_grounding_recall_filter', 'sentence_split_mapper'],
'opencc': ['chinese_convert_mapper'],
'opencv-python': [
'image_face_blur_mapper', 'image_face_ratio_filter',
'video_face_blur_mapper', 'video_motion_score_filter',
'video_remove_watermark_mapper'
],
'ram': ['image_tagging_mapper', 'video_tagging_from_frames_mapper'],
'rouge': ['generate_qa_from_examples_mapper'],
'scenedetect[opencv]': ['video_split_by_scene_mapper'],
'scipy': ['document_minhash_deduplicator'],
'selectolax': ['clean_html_mapper'],
'sentencepiece': [
'flagged_words_filter', 'perplexity_filter', 'stopwords_filter',
'word_repetition_filter', 'words_num_filter'
],
'scipy': ['document_minhash_deduplicator'],
'ftfy': ['fix_unicode_mapper'],
'simhash-pybind': [
'document_simhash_deduplicator', 'image_captioning_mapper',
'image_diffusion_mapper', 'video_captioning_from_frames_mapper',
'video_captioning_from_summarizer_mapper',
'video_captioning_from_video_mapper'
],
'selectolax': ['clean_html_mapper'],
'nlpaug': ['nlpaug_en_mapper'],
'nlpcda': ['nlpcda'],
'nltk': ['phrase_grounding_recall_filter', 'sentence_split_mapper'],
'transformers': [
'alphanumeric_filter', 'image_aesthetics_filter', 'image_nsfw_filter',
'image_text_matching_filter', 'image_text_similarity_filter',
'image_watermark_filter', 'phrase_grounding_recall_filter',
'token_num_filter', 'video_aesthetics_filter',
'video_frames_text_similarity_filter', 'video_nsfw_filter',
'generate_qa_from_text_mapper', 'generate_qa_from_examples_mapper',
'image_captioning_mapper', 'image_diffusion_mapper',
'optimize_query_mapper', 'optimize_response_mapper',
'optimize_qa_mapper', 'video_captioning_from_audio_mapper',
'video_captioning_from_frames_mapper',
'video_captioning_from_summarizer_mapper',
'video_captioning_from_video_mapper', 'video_tagging_from_audio_mapper'
],
'transformers_stream_generator': [
'simple-aesthetics-predictor':
['image_aesthetics_filter', 'video_aesthetics_filter'],
'spacy-pkuseg': ['text_action_filter', 'text_entity_dependency_filter'],
'tiktoken': [
'video_captioning_from_audio_mapper',
'video_captioning_from_summarizer_mapper'
],
'einops': [
'video_captioning_from_audio_mapper',
'video_captioning_from_summarizer_mapper'
'torch': [
'generate_qa_from_examples_mapper', 'generate_qa_from_text_mapper',
'image_aesthetics_filter', 'image_captioning_mapper',
'image_diffusion_mapper', 'image_nsfw_filter', 'image_segment_mapper',
'image_tagging_mapper', 'image_text_matching_filter',
'image_text_similarity_filter', 'image_watermark_filter',
'optimize_qa_mapper', 'optimize_query_mapper',
'optimize_response_mapper', 'phrase_grounding_recall_filter',
'video_aesthetics_filter', 'video_captioning_from_frames_mapper',
'video_captioning_from_summarizer_mapper',
'video_captioning_from_video_mapper',
'video_frames_text_similarity_filter', 'video_nsfw_filter',
'video_tagging_from_audio_mapper', 'video_tagging_from_frames_filter',
'video_tagging_from_frames_mapper', 'video_watermark_filter'
],
'accelerate': [
'torchaudio': [
'video_captioning_from_summarizer_mapper',
'video_tagging_from_audio_mapper'
],
'transformers': [
'alphanumeric_filter', 'generate_qa_from_examples_mapper',
'generate_qa_from_text_mapper', 'image_aesthetics_filter',
'image_captioning_mapper', 'image_diffusion_mapper',
'image_nsfw_filter', 'image_text_matching_filter',
'image_text_similarity_filter', 'image_watermark_filter',
'optimize_qa_mapper', 'optimize_query_mapper',
'optimize_response_mapper', 'phrase_grounding_recall_filter',
'token_num_filter', 'video_aesthetics_filter',
'video_captioning_from_audio_mapper',
'video_captioning_from_summarizer_mapper'
'video_captioning_from_frames_mapper',
'video_captioning_from_summarizer_mapper',
'video_captioning_from_video_mapper',
'video_frames_text_similarity_filter', 'video_nsfw_filter',
'video_tagging_from_audio_mapper'
],
'tiktoken': [
'transformers_stream_generator': [
'video_captioning_from_audio_mapper',
'video_captioning_from_summarizer_mapper'
],
'opencc': ['chinese_convert_mapper'],
'imagededup': ['image_deduplicator', 'ray_image_deduplicator'],
'spacy-pkuseg': ['text_action_filter', 'text_entity_dependency_filter'],
'diffusers': ['image_diffusion_mapper'],
'simple-aesthetics-predictor':
['image_aesthetics_filter', 'video_aesthetics_filter'],
'scenedetect[opencv]': ['video_split_by_scene_mapper'],
'ffmpeg-python': [
'audio_ffmpeg_wrapped_mapper', 'video_ffmpeg_wrapped_mapper',
'video_resize_aspect_ratio_mapper', 'video_resize_resolution_mapper'
],
'opencv-python': [
'image_face_ratio_filter', 'video_motion_score_filter',
'image_face_blur_mapper', 'video_face_blur_mapper',
'video_remove_watermark_mapper'
],
'ultralytics': ['image_segment_mapper'],
'vllm': [
'generate_qa_from_text_mapper',
'generate_qa_from_examples_mapper',
'optimize_query_mapper',
'optimize_response_mapper',
'optimize_qa_mapper',
],
'rouge': ['generate_qa_from_examples_mapper'],
'ram': ['image_tagging_mapper', 'video_tagging_from_frames_mapper']
'generate_qa_from_examples_mapper', 'generate_qa_from_text_mapper',
'optimize_qa_mapper', 'optimize_query_mapper',
'optimize_response_mapper'
]
}
3 changes: 3 additions & 0 deletions data_juicer/utils/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ class Fields(object):
context = DEFAULT_PREFIX + 'context__'
suffix = DEFAULT_PREFIX + 'suffix__'

# bounding box tag
bbox_tag = DEFAULT_PREFIX + 'bbox__'

# video_frame_tags
video_frame_tags = DEFAULT_PREFIX + 'video_frame_tags__'
video_audio_tags = DEFAULT_PREFIX + 'video_audio_tags__'
Expand Down
Loading
Loading