First version of Semgrep service
kam193 committed May 18, 2024
1 parent 028bdeb commit 0a4311f
Showing 6 changed files with 110 additions and 41 deletions.
18 changes: 13 additions & 5 deletions README.md
@@ -27,6 +27,7 @@ number of badlisted files, it's just a linear comparison.

It performs hash lookups to identify well-known good and bad files. It can be used to avoid analyzing well-known
files. Responses are cached. Currently supported services:

- [CIRCL Hashlookup](https://www.circl.lu/services/hashlookup/): identify well-known files and return a trust
score. DNS queries are used to check for the hash, and then the REST API to get more details. It could be an
online alternative to loading the NIST NSRL database (and more) into the Safelist service.
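
For illustration, a lookup along these lines could be sketched as follows (the DNS zone, the REST path, and the `dnspython`/`requests` usage are assumptions based on the public hashlookup documentation, not this service's code):

```python
# Hedged sketch: DNS-first hash lookup against CIRCL Hashlookup, then REST for details.
import json

import dns.resolver  # dnspython
import requests


def lookup_sha1(sha1: str) -> dict | None:
    try:
        # A TXT record is only published for hashes known to hashlookup.
        answer = dns.resolver.resolve(f"{sha1.lower()}.dns.hashlookup.circl.lu", "TXT")
    except dns.resolver.NXDOMAIN:
        return None  # unknown file
    summary = json.loads(b"".join(answer[0].strings).decode())
    # The REST API returns the full record, including the trust score.
    details = requests.get(f"https://hashlookup.circl.lu/lookup/sha1/{sha1}", timeout=10).json()
    return {"summary": summary, "details": details}
```
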
@@ -37,28 +38,35 @@ files. Responses are cached. Currently supported services:

Service to get information about IPs and domains. Currently supported:

- IP data from MMDB files (you can configure your own, the default one is [GeoOpen](https://cra.circl.lu/opendata/geo-open/)),
- WHOIS data for domains, including domains extracted from URIs. Results are cached.
- IP data from MMDB files (you can configure your own, the default one is [GeoOpen](https://cra.circl.lu/opendata/geo-open/)),
- WHOIS data for domains, including domains extracted from URIs. Results are cached.

Supported heuristics:
- newly created domains (based on WHOIS data).

- newly created domains (based on WHOIS data).
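
As an illustration, the newly-created-domain check could be approximated like this (the `python-whois` package and the 30-day threshold are assumptions, not the service's actual configuration):

```python
# Hedged sketch of a "newly created domain" heuristic based on WHOIS data.
from datetime import datetime, timedelta

import whois  # python-whois


def is_newly_created(domain: str, max_age_days: int = 30) -> bool:
    record = whois.whois(domain)
    created = record.creation_date
    if isinstance(created, list):  # some registries return several dates
        created = min(created)
    if not created:
        return False  # no creation date available, don't raise the heuristic
    return datetime.now() - created < timedelta(days=max_age_days)
```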

### PCAP Extractor

This service lists TCP flows from a pcap file using Tshark. If supported by Tshark, it can also extract files.
It tries to set as many tags as possible and respects safelisting to avoid unnecessary operations.

Supported heuristics:
- external HTTP/non-HTTP connections,
- data exfiltration threshold (based on total data sent out).

- external HTTP/non-HTTP connections,
- data exfiltration threshold (based on total data sent out).
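
For example, the exfiltration heuristic could be approximated by summing outbound TCP payload bytes reported by Tshark (the field selection and the threshold below are illustrative assumptions, not this service's implementation):

```python
# Hedged sketch: sum TCP payload bytes sent from private to public addresses in a pcap.
import ipaddress
import subprocess

EXFIL_THRESHOLD = 10 * 1024 * 1024  # e.g. flag captures that send out more than ~10 MB


def bytes_sent_externally(pcap_path: str) -> int:
    proc = subprocess.run(
        ["tshark", "-r", pcap_path, "-T", "fields",
         "-e", "ip.src", "-e", "ip.dst", "-e", "tcp.len"],
        capture_output=True, text=True, check=True,
    )
    total = 0
    for line in proc.stdout.splitlines():
        parts = line.split("\t")
        if len(parts) != 3 or not all(parts):
            continue
        src, dst, length = parts
        try:
            if ipaddress.ip_address(src).is_private and ipaddress.ip_address(dst).is_global:
                total += int(length)
        except ValueError:  # non-IP or multi-valued columns
            continue
    return total
```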

### Python Magic

Designed to help with analysis of Python artifacts. Currently supported:

- unpacking PyInstaller executables (using [pyinstxtractor-ng](https://github.com/pyinstxtractor/pyinstxtractor-ng)),
- decompiling Python bytecode (.pyc) (using [Decompyle++](https://github.com/zrax/pycdc)),
- extracting declared dependencies and matching them against configurable lists of suspicious and malicious packages.
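
A rough sketch of the dependency-matching step (the requirements parsing and the package names below are hypothetical, not the lists the service is configured with):

```python
# Hedged sketch: match declared dependencies against configured package lists.
import re

SUSPICIOUS_PACKAGES = {"some-suspicious-package"}  # hypothetical configured lists
MALICIOUS_PACKAGES = {"some-malicious-package"}


def check_requirements(requirements_text: str) -> dict[str, list[str]]:
    hits = {"suspicious": [], "malicious": []}
    for line in requirements_text.splitlines():
        line = line.split("#", 1)[0].strip()  # drop comments
        if not line:
            continue
        # keep only the package name, dropping extras and version specifiers
        name = re.split(r"[\[<>=!~; ]", line, maxsplit=1)[0].lower()
        if name in MALICIOUS_PACKAGES:
            hits["malicious"].append(name)
        elif name in SUSPICIOUS_PACKAGES:
            hits["suspicious"].append(name)
    return hits
```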

### Semgrep

Service using [Semgrep](https://semgrep.dev) OSS to analyze code for malicious activity. Currently in the alpha stage.

### Simple Downloader

Very simple service to download URLs, without running a whole browser. User-agent can be configured.
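
In essence, the download boils down to something like this (a sketch; the default User-Agent string is an assumption):

```python
# Hedged sketch: fetch a URL with a configurable User-Agent, without a browser.
import requests


def download(url: str, user_agent: str = "Mozilla/5.0 (compatible; SimpleDownloader)") -> bytes:
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=30, allow_redirects=True)
    response.raise_for_status()
    return response.content
```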
4 changes: 3 additions & 1 deletion semgrep/README.md
@@ -1,3 +1,5 @@
# Semgrep

Service using [Semgrep](https://semgrep.dev) to analyze code for malicious activity.
Service using [Semgrep](https://semgrep.dev) OSS to analyze code for malicious activity.

Currently in the alpha stage. Loads rules on every check (TODO: cache rules).
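
At its core, the check boils down to running the Semgrep OSS CLI against the submitted file and reading its JSON report, roughly like this (a sketch; the service's real invocation, flags, and rule handling differ):

```python
# Hedged sketch: run Semgrep OSS with a local rules file and parse its JSON report.
import json
import subprocess


def scan(path: str, rules: str = "rules.yaml") -> list[dict]:
    proc = subprocess.run(
        ["semgrep", "scan", "--json", "--config", rules, path],
        capture_output=True, text=True, timeout=60,
    )
    report = json.loads(proc.stdout)
    return report.get("results", [])
```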
2 changes: 1 addition & 1 deletion semgrep/VERSION
@@ -1 +1 @@
0
1
76 changes: 60 additions & 16 deletions semgrep/service/al_run.py
@@ -1,10 +1,13 @@
import hashlib
import json
import os
import subprocess
import tempfile
from collections import defaultdict
from threading import RLock
from typing import Iterable

import yaml
from assemblyline_v4_service.common.base import ServiceBase
from assemblyline_v4_service.common.request import ServiceRequest
from assemblyline_v4_service.common.result import (
@@ -34,10 +37,15 @@
"ERROR": 2,
}

RULES_LOCK = RLock()


class AssemblylineService(ServiceBase):
def __init__(self, config=None):
super().__init__(config)
self._active_rules = []
self._active_rules_dir = None
self._active_rules_prefix = ""

def _load_config(self):
self._semgrep_config = {}
@@ -52,31 +60,65 @@ def start(self):

self.log.info(f"{self.service_attributes.name} service started")

# def _load_rules(self) -> None:
# pass
def _load_rules(self) -> None:
# the signature client doesn't support joining signatures into a single YAML file, so we need to recreate it using our delimiter
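# Illustrative shape of a downloaded signature file (an assumption based on the
# custom_delimiter configured in service_manifest.yml, not an actual artifact):
#
#   id: first-rule
#   ...rest of the first rule...
#   #SIGNATURE-DELIMITER#
#
#   id: second-rule
#   ...rest of the second rule...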
new_rules_dir = tempfile.TemporaryDirectory(prefix="semgrep_rules_")
files = []
for source_file in self.rules_list:
rules = []
with open(source_file, "r") as f:
tmp_data = []
for line in f:
if "#SIGNATURE-DELIMITER" in line:
rules.append(yaml.safe_load("\n".join(tmp_data)))
tmp_data = []
else:
tmp_data.append(line)
if tmp_data:
rules.append(yaml.safe_load("\n".join(tmp_data)))
source_name = os.path.basename(source_file)
new_file = os.path.join(new_rules_dir.name, source_name, "rules.yaml")
os.makedirs(os.path.dirname(new_file), exist_ok=True)
with open(new_file, "w") as f:
yaml.dump({"rules": rules}, f, indent=2)
files.append(new_file)

self.log.debug(self.rules_list)
with RULES_LOCK:
self._active_rules = []
self._active_rules_dir, old_rules_dir = new_rules_dir, self._active_rules_dir
for source_file in files:
self._active_rules.append("--config")
self._active_rules.append(source_file)
self._active_rules_prefix = ".".join(self._active_rules_dir.name.split("/"))
if old_rules_dir:
old_rules_dir.cleanup()

def _execute_semgrep(self, file_path: str) -> dict:
cmd = ["semgrep"] + BASE_CONFIG
for option, value in self._semgrep_config.items():
cmd.append(f"--{option}")
cmd.append(value)

result = subprocess.run(
cmd + ["--config", f"{RULES}", file_path],
capture_output=True,
text=True,
timeout=self._cli_timeout,
)
with RULES_LOCK:
result = subprocess.run(
cmd + self._active_rules + [file_path],
capture_output=True,
text=True,
timeout=self._cli_timeout,
)
rules_prefix = self._active_rules_prefix

self.log.debug("Semgrep result: %s", result.stdout)

# Something was found
if result.returncode == 1:
return json.loads(result.stdout)
return json.loads(result.stdout), rules_prefix
elif result.returncode == 0:
return {}
return {}, None
else:
self.log.error("Error running semgrep (%d) %s", result.returncode, result.stderr)
raise RuntimeError(f"Error {result.returncode} running semgrep: {result.stderr[:250]}")
return {}

def _get_code_hash(self, code: str):
@@ -87,12 +129,15 @@ def _get_code_hash(self, code: str):
code_hash = hashlib.sha256(code.encode()).hexdigest()
return f"code.{code_hash}"

def _process_results(self, results: list[dict]) -> Iterable[ResultMultiSection]:
def _process_results(
self, results: list[dict], rule_prefix: str
) -> Iterable[ResultMultiSection]:
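# Each Semgrep finding is assumed to look roughly like the following (based on the
# Semgrep JSON output format; only fields of interest are shown):
#   {"check_id": "<path-derived prefix>.<rule-id>",
#    "start": {"line": ...}, "end": {"line": ...},
#    "extra": {"message": ..., "severity": "INFO|WARNING|ERROR",
#              "lines": ..., "metadata": {...}}}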
result_by_rule = defaultdict(list)
for result in results:
result_by_rule[result["check_id"]].append(result)

for rule_id, matches in result_by_rule.items():
rule_id = rule_id[len(rule_prefix) :]
extra = matches[0].get("extra", {})
message = extra.get("message", "")
severity = extra.get("severity", "INFO")
@@ -116,21 +161,20 @@ def _process_results(self, results: list[dict]) -> Iterable[ResultMultiSection]:
tags={"file.rule.semgrep": [code_hash, rule_id]},
)
section.add_tag("file.rule.semgrep", code_hash)
# Looks like setting a heuristic on subsections causes zeroization to fail
# subsection.set_heuristic(heuristic, signature=rule_id, attack_id=attack_id)
yield section

def execute(self, request: ServiceRequest) -> None:
result = Result()
request.result = result

results = self._execute_semgrep(request.file_path)
results, rule_prefix = self._execute_semgrep(request.file_path)
request.set_service_context(f"Semgrep™ OSS {results.get('version', '')}")
if res_list := results.get("results", []):
# main_section = ResultTextSection("Results from Semgrep™ OSS Engine")
# result.add_section(main_section)
for result_section in self._process_results(res_list):
for result_section in self._process_results(res_list, rule_prefix):
result.add_section(result_section)

with tempfile.NamedTemporaryFile("w", delete=False) as f:
json.dump(results, f, indent=2)
request.add_supplementary(f.name, "semgrep_results.json", "Semgrep™ OSS Results")
request.add_supplementary(f.name, "semgrep_results.json", "Semgrep™ OSS Results")
47 changes: 30 additions & 17 deletions semgrep/service/updater.py
@@ -1,55 +1,68 @@
import os
import pathlib
import subprocess

import yaml
from assemblyline.odm.models.signature import Signature
from assemblyline_v4_service.updater.updater import ServiceUpdater

from .al_run import BASE_CONFIG


class AssemblylineServiceUpdater(ServiceUpdater):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

# By default, the SERVICE_PATH is used to generate the name
self.updater_type = "semgrep"
status_query = " OR ".join([f"status:{s}" for s in self.statuses])
self.signatures_query = f"type:{self.updater_type} AND ({status_query})"

self.persistent_dir = pathlib.Path(os.getenv("UPDATER_DIR", "/tmp/updater"))
self.client

# def do_source_update(
# self, service: Service, specific_sources: list[str] = []
# ) -> None:
# pass
def is_valid(self, file_path: str):
# semgrep --validate calls their registry to get linting rules
# as per https://github.com/semgrep/semgrep/blob/73b6cf90c5ac71e001711f98adb72ca4ba8b2f8f/src/metachecking/Check_rule.ml#L44
# they are necessary to validate the rule file
result = subprocess.run(
["semgrep"] + BASE_CONFIG + ["--config", file_path, "--validate"],
capture_output=True,
text=True,
)
if result.returncode != 0:
self.log.error("Error validating semgrep rule file: %s", result.stderr)
return False
return True

# def is_valid(self, file_path) -> bool:
# return True
def _preprocess_rule(self, rule: dict) -> dict:
# In AssemblyLine, there will be no project directory, so we need to remove paths
if "paths" in rule:
del rule["paths"]
return rule
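# For reference, a minimal Semgrep rule as stored in a source file looks roughly like
# this (illustrative example, not one of the configured rules):
#   - id: example-eval-detected
#     languages: [python]
#     severity: WARNING
#     message: eval() call detected
#     pattern: eval(...)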

def import_update(self, files_sha256, source, default_classification) -> None:
# output_dir = os.path.join(self.latest_updates_dir, source)
# os.makedirs(os.path.join(self.latest_updates_dir, source), exist_ok=True)
signatures: list[Signature] = []
for file, _ in files_sha256:
with open(file, "r") as f:
rules = yaml.safe_load(f).get("rules", [])

for rule in rules:
rule = self._preprocess_rule(rule)
signature = Signature(
dict(
classification=default_classification,
data=yaml.dump(rule),
data=yaml.dump(rule, indent=2),
name=rule["id"],
source=source,
status="DEPLOYED",
type="semgrep",
type=self.updater_type,
revision=1,
signature_id=rule["id"],
)
)
signatures.append(signature)

self.client.signature.add_update_many(source, "semgrep", signatures)

# def prepare_output_directory(self) -> str:
# tempdir = tempfile.mkdtemp()
# shutil.copytree(self.latest_updates_dir, tempdir, dirs_exist_ok=True)
# return tempdir
self.client.signature.add_update_many(source, self.updater_type, signatures)


if __name__ == "__main__":
4 changes: 3 additions & 1 deletion semgrep/service_manifest.yml
@@ -60,6 +60,8 @@ docker_config:
update_config:
update_interval_seconds: 86400 # 1 day
generates_signatures: true
signature_delimiter: custom
custom_delimiter: "\n#SIGNATURE-DELIMITER#\n\n"
wait_for_update: true
sources:
- uri: https://gist.githubusercontent.com/kam193/474547be4a37bb7990caa4e26ee542e4/raw/a0d3783f38e156cfda5df63f426bfa8d36cc4027/example_rule.yaml
@@ -74,4 +76,4 @@ dependencies:
command: ["python", "-m", "service.updater"]
image: ${REGISTRY}ghcr.io/kam193/assemblyline-service-semgrep:$SERVICE_TAG
ports: ["5003"]
run_as_core: True
run_as_core: true
