Skip to content

Commit

Permalink
Make built-in adapters' identifiers configurable
Browse files Browse the repository at this point in the history
This will allow end users to provide their own lists of extensions and/or
mimetypes for each of the built-in adapters.
  • Loading branch information
lafrenierejm committed Dec 29, 2024
1 parent c97fa3a commit 802976f
Show file tree
Hide file tree
Showing 14 changed files with 634 additions and 328 deletions.
166 changes: 139 additions & 27 deletions src/adapters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pub mod zip;
use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
use anyhow::{format_err, Context, Result};
use async_trait::async_trait;
use custom::Builtin;
use custom::CustomAdapterConfig;
use custom::BUILTIN_SPAWNING_ADAPTERS;
use log::*;
Expand Down Expand Up @@ -38,7 +39,7 @@ pub struct AdapterMeta {
pub fast_matchers: Vec<FastFileMatcher>,
/// list of matchers when we have mime type detection active (interpreted as ORed)
/// warning: this *overrides* the fast matchers
pub slow_matchers: Option<Vec<FileMatcher>>,
pub slow_matchers: Vec<FileMatcher>,
/// if true, slow_matchers is merged with fast matchers if accurate is enabled
/// for example, in sqlite you want this disabled since the db extension can mean other things and the mime type matching is very accurate for sqlite.
/// but for tar you want it enabled, since the tar extension is very accurate but the tar mime matcher can have false negatives
Expand All @@ -48,39 +49,63 @@ pub struct AdapterMeta {
}
impl AdapterMeta {
// todo: this is pretty ugly
pub fn get_matchers<'a>(
&'a self,
slow: bool,
) -> Box<dyn Iterator<Item = Cow<FileMatcher>> + 'a> {
pub fn get_matchers(&self, slow: bool) -> Box<dyn Iterator<Item = Cow<FileMatcher>> + '_> {
match (
slow,
self.keep_fast_matchers_if_accurate,
&self.slow_matchers,
&self.fast_matchers,
) {
(true, false, Some(ref sm)) => Box::new(sm.iter().map(Cow::Borrowed)),
(true, true, Some(ref sm)) => Box::new(
(true, false, sm, _) => Box::new(sm.iter().map(Cow::Borrowed)),
(true, true, sm, fm) => Box::new(
sm.iter().map(Cow::Borrowed).chain(
self.fast_matchers
.iter()
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))),
fm.iter()
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone())))
.collect::<Vec<_>>(),
),
),
// don't have slow matchers or slow matching disabled
(true, _, None) | (false, _, _) => Box::new(
self.fast_matchers
.iter()
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))),
),
// slow matching disabled
(false, _, _, fm) => {
Box::new(fm.iter().map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))))
}
}
}
}

pub trait GetMetadata {
fn metadata(&self) -> &AdapterMeta;
pub trait Adapter {
fn name(&self) -> String;
fn version(&self) -> i32;
fn description(&self) -> String;
fn recurses(&self) -> bool;
fn disabled_by_default(&self) -> bool;
fn keep_fast_matchers_if_accurate(&self) -> bool;
fn extensions(&self) -> Vec<String>;
fn mimetypes(&self) -> Vec<String>;

fn metadata(&self) -> AdapterMeta {
return AdapterMeta {
name: self.name(),
version: self.version(),
description: self.description(),
recurses: true,
fast_matchers: self
.extensions()
.iter()
.map(|s| FastFileMatcher::FileExtension(s.to_string()))
.collect(),
slow_matchers: self
.mimetypes()
.iter()
.map(|mimetype| FileMatcher::MimeType(mimetype.to_string()))
.collect(),
disabled_by_default: self.disabled_by_default(),
keep_fast_matchers_if_accurate: self.keep_fast_matchers_if_accurate(),
};
}
}

#[async_trait]
pub trait FileAdapter: GetMetadata + Send + Sync {
pub trait FileAdapter: Adapter + Send + Sync {
/// adapt a file.
///
/// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
Expand Down Expand Up @@ -109,7 +134,67 @@ pub struct AdaptInfo {
/// (enabledAdapters, disabledAdapters)
type AdaptersTuple = (Vec<Arc<dyn FileAdapter>>, Vec<Arc<dyn FileAdapter>>);

pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> AdaptersTuple {
pub fn get_all_adapters(
custom_extensions: Option<HashMap<String, Builtin>>,
custom_mimetypes: Option<HashMap<String, Builtin>>,
custom_adapters: Option<Vec<CustomAdapterConfig>>,
) -> AdaptersTuple {
let extensions: &mut HashMap<Builtin, Vec<String>> = &mut HashMap::new();
if let Some(ce) = custom_extensions.as_ref() {
for (ext, builtin) in ce {
extensions
.entry(*builtin)
.or_default()
.push(ext.to_string());
}
}
for (builtin, exts) in [
(Builtin::BZ2, decompress::EXTENSIONS_BZ2),
(Builtin::GZ, decompress::EXTENSIONS_GZ),
(Builtin::XZ, decompress::EXTENSIONS_XZ),
(Builtin::ZST, decompress::EXTENSIONS_ZST),
(Builtin::FFMPEG, ffmpeg::EXTENSIONS),
(Builtin::MBOX, mbox::EXTENSIONS),
(Builtin::SQLITE, sqlite::EXTENSIONS),
(Builtin::TAR, tar::EXTENSIONS),
(Builtin::ZIP, zip::EXTENSIONS),
] {
for ext in exts {
if !custom_extensions
.as_ref()
.is_some_and(|ce| ce.contains_key(ext.to_owned()))
{
extensions.entry(builtin).or_default().push(ext.to_string());
}
}
}

let mimetypes: &mut HashMap<Builtin, Vec<String>> = &mut HashMap::new();
if let Some(cm) = custom_mimetypes.as_ref() {
for (mime, builtin) in cm {
mimetypes
.entry(*builtin)
.or_default()
.push(mime.to_string());
}
}
for (builtin, mimes) in [
(Builtin::BZ2, decompress::MIMETYPES_BZ2),
(Builtin::GZ, decompress::MIMETYPES_GZ),
(Builtin::XZ, decompress::MIMETYPES_XZ),
(Builtin::ZST, decompress::MIMETYPES_ZST),
(Builtin::FFMPEG, ffmpeg::MIMETYPES),
(Builtin::MBOX, mbox::MIMETYPES),
(Builtin::SQLITE, sqlite::MIMETYPES),
(Builtin::TAR, tar::MIMETYPES),
(Builtin::ZIP, zip::MIMETYPES),
] {
let val = mimetypes.entry(builtin).or_default();
for mime in mimes {
val.push(mime.to_string());
}
}

// order in descending priority
let mut adapters: Vec<Arc<dyn FileAdapter>> = vec![];
if let Some(custom_adapters) = custom_adapters {
Expand All @@ -120,12 +205,36 @@ pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> Ad

let internal_adapters: Vec<Arc<dyn FileAdapter>> = vec![
Arc::new(PostprocPageBreaks::default()),
Arc::new(ffmpeg::FFmpegAdapter::new()),
Arc::new(zip::ZipAdapter::new()),
Arc::new(decompress::DecompressAdapter::new()),
Arc::new(mbox::MboxAdapter::new()),
Arc::new(tar::TarAdapter::new()),
Arc::new(sqlite::SqliteAdapter::new()),
Arc::new(ffmpeg::FFmpegAdapter {
extensions: extensions[&Builtin::FFMPEG].clone(),
mimetypes: mimetypes[&Builtin::FFMPEG].clone(),
}),
Arc::new(zip::ZipAdapter {
extensions: extensions[&Builtin::ZIP].clone(),
mimetypes: mimetypes[&Builtin::ZIP].clone(),
}),
Arc::new(decompress::DecompressAdapter {
extensions_gz: extensions[&Builtin::GZ].clone(),
extensions_bz2: extensions[&Builtin::BZ2].clone(),
extensions_xz: extensions[&Builtin::XZ].clone(),
extensions_zst: extensions[&Builtin::ZST].clone(),
mimetypes_gz: mimetypes[&Builtin::GZ].clone(),
mimetypes_bz2: mimetypes[&Builtin::BZ2].clone(),
mimetypes_xz: mimetypes[&Builtin::XZ].clone(),
mimetypes_zst: mimetypes[&Builtin::ZST].clone(),
}),
Arc::new(mbox::MboxAdapter {
extensions: extensions[&Builtin::MBOX].clone(),
mimetypes: mimetypes[&Builtin::MBOX].clone(),
}),
Arc::new(sqlite::SqliteAdapter {
extensions: extensions[&Builtin::SQLITE].clone(),
mimetypes: mimetypes[&Builtin::SQLITE].clone(),
}),
Arc::new(tar::TarAdapter {
extensions: extensions[&Builtin::TAR].clone(),
mimetypes: mimetypes[&Builtin::TAR].clone(),
}),
];
adapters.extend(
BUILTIN_SPAWNING_ADAPTERS
Expand All @@ -148,10 +257,13 @@ pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> Ad
* - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority)
*/
pub fn get_adapters_filtered<T: AsRef<str>>(
custom_extensions: Option<HashMap<String, Builtin>>,
custom_identifiers: Option<HashMap<String, Builtin>>,
custom_adapters: Option<Vec<CustomAdapterConfig>>,
adapter_names: &[T],
) -> Result<Vec<Arc<dyn FileAdapter>>> {
let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters);
let (def_enabled_adapters, def_disabled_adapters) =
get_all_adapters(custom_extensions, custom_identifiers, custom_adapters);
let adapters = if !adapter_names.is_empty() {
let adapters_map: HashMap<_, _> = def_enabled_adapters
.iter()
Expand Down
Loading

0 comments on commit 802976f

Please sign in to comment.