Skip to content

Commit

Permalink
First version of semgrep service
Browse files Browse the repository at this point in the history
  • Loading branch information
kam193 committed May 16, 2024
1 parent 57d746e commit 028bdeb
Show file tree
Hide file tree
Showing 11 changed files with 315 additions and 0 deletions.
4 changes: 4 additions & 0 deletions semgrep/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.env
.randomnotes/
.git/
*.pyc
24 changes: 24 additions & 0 deletions semgrep/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Build arguments declared before FROM are global: they can be used in the FROM
# line itself but are NOT visible to instructions inside the build stage unless
# they are re-declared there.
ARG REGISTRY=
ARG MANIFEST_REGISTRY=ghcr.io/
ARG BASE_IMAGE=cccs/assemblyline-v4-service-base:stable
FROM ${BASE_IMAGE}

# Python path of the service class started by the base image.
ENV SERVICE_PATH=service.al_run.AssemblylineService

USER root
# Apply pending OS updates and drop the package lists so they do not bloat the layer.
RUN apt-get update && apt-get upgrade -y && rm -rf /var/lib/apt/lists/*

USER assemblyline
# Copy the requirements first so the dependency layer is cached independently
# of changes to the service code.
COPY requirements.txt requirements.txt

RUN pip install --no-cache-dir --user --requirement requirements.txt && rm -rf ~/.cache/pip

WORKDIR /opt/al_service
COPY . .

USER root
ARG BASE_TAG=4.5.0.stable
# Re-declare the global ARG inside the stage (without a value it inherits the
# default or the --build-arg given above). Without this line $MANIFEST_REGISTRY
# expands to an empty string in the RUN below and the registry prefix is
# silently stripped from the image name in the manifest.
ARG MANIFEST_REGISTRY
# 1) Rewrite the registry prefix of the image references in the manifest.
# 2) Replace the $SERVICE_TAG placeholder with "<BASE_TAG><contents of VERSION>".
RUN sed -i "s|\(image: \${REGISTRY}\).*\(kam193/.*\)|\1$MANIFEST_REGISTRY\2|g" service_manifest.yml && \
    sed -i "s/\$SERVICE_TAG/$BASE_TAG$(cat VERSION)/g" service_manifest.yml

USER assemblyline
4 changes: 4 additions & 0 deletions semgrep/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Pull in the make rules/variables shared by the services from the parent directory.
include ../common.mk

# Name of this service; presumably consumed by the rules in common.mk — confirm there.
AL_SERVICE_NAME=Semgrep
# SERVICE_NAME=assemblyline-service-template
3 changes: 3 additions & 0 deletions semgrep/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Semgrep

Service using [Semgrep](https://semgrep.dev) to analyze code for malicious activity.
1 change: 1 addition & 0 deletions semgrep/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
2 changes: 2 additions & 0 deletions semgrep/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
assemblyline-v4-service
semgrep
7 changes: 7 additions & 0 deletions semgrep/sample_rules/exec-rule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
rules:
- id: exec-call
pattern: exec(...)
message: Executing code dynamically
severity: WARNING
languages:
- python
Empty file added semgrep/service/__init__.py
Empty file.
136 changes: 136 additions & 0 deletions semgrep/service/al_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import hashlib
import json
import subprocess
import tempfile
from collections import defaultdict
from typing import Iterable

from assemblyline_v4_service.common.base import ServiceBase
from assemblyline_v4_service.common.request import ServiceRequest
from assemblyline_v4_service.common.result import (
Result,
ResultMemoryDumpSection,
ResultMultiSection,
ResultTextSection,
)

# Rule file handed to semgrep through `--config` (relative path).
RULES = "sample_rules/exec-rule.yaml"

# Command-line flags added to every semgrep invocation, regardless of the
# service configuration.
BASE_CONFIG = [
    f"--{flag}"
    for flag in (
        "metrics=off",
        "quiet",
        "error",
        "no-autofix",
        "no-git-ignore",
        "scan-unknown-extensions",
        "disable-version-check",
        "disable-nosem",
        "json",
    )
]

# Semgrep rule severity -> heur_id declared in service_manifest.yml
# (1: suspicious, 2: malicious, 3: informative).
SEVERITY_TO_HEURISTIC = dict(INFO=3, WARNING=1, ERROR=2)


class AssemblylineService(ServiceBase):
    """Assemblyline service that scans the submitted file with the Semgrep CLI.

    Findings reported by semgrep are grouped per rule and converted into
    result sections; the raw JSON report is attached as a supplementary file.
    """

    def __init__(self, config=None):
        super().__init__(config)

    def _load_config(self):
        """Read the semgrep limits from the service configuration.

        The per-rule timeout and memory limit are kept as strings because
        they are passed verbatim as command-line arguments.
        """
        self._semgrep_config = {
            "timeout": str(self.config.get("SEMGREP_RULE_TIMEOUT", 10)),
            "max-memory": str(self.config.get("SEMGREP_RAM_LIMIT_MB", 400)),
        }
        # Wall-clock limit (seconds) for the whole semgrep process.
        self._cli_timeout = int(self.config.get("SEMGREP_CLI_TIMEOUT", 60))

    def start(self):
        """Load the configuration when the service starts."""
        self.log.info(f"start() from {self.service_attributes.name} service called")
        self._load_config()

        self.log.info(f"{self.service_attributes.name} service started")

    def _execute_semgrep(self, file_path: str) -> dict:
        """Run semgrep on ``file_path`` and return its parsed JSON report.

        Returns:
            The decoded JSON document when semgrep exits with code 1
            (treated as "something was found"), otherwise an empty dict.

        Raises:
            subprocess.TimeoutExpired: if semgrep runs longer than the
                configured CLI timeout.
        """
        cmd = ["semgrep", *BASE_CONFIG]
        for option, value in self._semgrep_config.items():
            cmd += [f"--{option}", value]
        cmd += ["--config", RULES, file_path]

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=self._cli_timeout,
        )

        self.log.debug("Semgrep result: %s", result.stdout)

        if result.returncode == 0:
            # Clean run, nothing matched.
            return {}

        if result.returncode == 1:
            # Something was found.
            try:
                return json.loads(result.stdout)
            except json.JSONDecodeError:
                # Treat an unreadable report like any other semgrep failure
                # instead of crashing the task.
                self.log.error(
                    "Semgrep returned invalid JSON (%d) %s",
                    result.returncode,
                    result.stderr,
                )
                return {}

        self.log.error("Error running semgrep (%d) %s", result.returncode, result.stderr)
        return {}

    def _get_code_hash(self, code: str) -> str:
        """Return ``code.<sha256>`` of the stripped snippet, or ``""`` if it is empty."""
        code = (code or "").strip()
        if not code:
            return ""
        code_hash = hashlib.sha256(code.encode()).hexdigest()
        return f"code.{code_hash}"

    def _process_results(self, results: list[dict]) -> Iterable[ResultMultiSection]:
        """Yield one result section per semgrep rule that matched.

        Message, severity and metadata are taken from the first match of each
        rule; every match is added as a memory-dump subsection containing the
        matched lines.
        """
        result_by_rule = defaultdict(list)
        for result in results:
            result_by_rule[result["check_id"]].append(result)

        for rule_id, matches in result_by_rule.items():
            extra = matches[0].get("extra", {})
            message = extra.get("message", "")
            severity = extra.get("severity", "INFO")
            # Heuristic id 0 is not declared in the service manifest, so an
            # unrecognised severity falls back to the informative heuristic,
            # the same one used when the severity is missing altogether.
            heuristic = SEVERITY_TO_HEURISTIC.get(
                severity.upper(), SEVERITY_TO_HEURISTIC["INFO"]
            )
            metadata = extra.get("metadata", {})
            title = metadata.get("title", metadata.get("name", message[:50]))
            attack_id = metadata.get("attack_id")

            section = ResultTextSection(
                title,
                zeroize_on_tag_safe=True,
            )
            section.add_line(message)
            section.set_heuristic(heuristic, signature=rule_id, attack_id=attack_id)

            for match in matches:
                lines = match.get("extra", {}).get("lines") or ""
                code_hash = self._get_code_hash(lines)
                # The hash is empty for blank snippets; never emit empty tag values.
                tag_values = [value for value in (code_hash, rule_id) if value]
                ResultMemoryDumpSection(
                    f"Match at line {match['start']['line']}",
                    body=lines,
                    parent=section,
                    zeroize_on_tag_safe=True,
                    tags={"file.rule.semgrep": tag_values},
                )
                if code_hash:
                    section.add_tag("file.rule.semgrep", code_hash)

            yield section

    def execute(self, request: ServiceRequest) -> None:
        """Scan the file of ``request`` and populate ``request.result``."""
        result = Result()
        request.result = result

        results = self._execute_semgrep(request.file_path)
        request.set_service_context(f"Semgrep™ OSS {results.get('version', '')}")
        if res_list := results.get("results", []):
            for result_section in self._process_results(res_list):
                result.add_section(result_section)

            # Write the raw report into the framework-provided working
            # directory rather than the global temp dir, where a
            # delete=False file would be left behind after every task.
            with tempfile.NamedTemporaryFile(
                "w", dir=self.working_directory, suffix=".json", delete=False
            ) as f:
                json.dump(results, f, indent=2)
            # Register the file only after it is closed, so its full content
            # is on disk when the framework reads it.
            request.add_supplementary(f.name, "semgrep_results.json", "Semgrep™ OSS Results")
57 changes: 57 additions & 0 deletions semgrep/service/updater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
import pathlib

import yaml
from assemblyline.odm.models.signature import Signature
from assemblyline_v4_service.updater.updater import ServiceUpdater


class AssemblylineServiceUpdater(ServiceUpdater):
    """Updater that imports Semgrep rule files as Assemblyline signatures."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.persistent_dir = pathlib.Path(os.getenv("UPDATER_DIR", "/tmp/updater"))
        # NOTE(review): bare attribute access kept from the original; it has no
        # effect unless `client` is lazily initialised by the base class — confirm
        # and remove if it is a plain attribute.
        self.client

    # def do_source_update(
    # self, service: Service, specific_sources: list[str] = []
    # ) -> None:
    # pass

    # def is_valid(self, file_path) -> bool:
    # return True

    def import_update(self, files_sha256, source, default_classification) -> None:
        """Convert every rule in the downloaded files into a signature and store it.

        Args:
            files_sha256: iterable of ``(file_path, sha256)`` pairs; only the
                path is used.
            source: name of the update source the files belong to.
            default_classification: classification assigned to every signature.
        """
        signatures: list[Signature] = []
        for file, _ in files_sha256:
            with open(file, "r", encoding="utf-8") as f:
                document = yaml.safe_load(f)

            # An empty file loads as None and a file may hold something other
            # than a mapping (or `rules: null`); treat all of these as "no rules"
            # instead of failing the whole update.
            rules = document.get("rules") if isinstance(document, dict) else None

            for rule in rules or []:
                signature = Signature(
                    dict(
                        classification=default_classification,
                        data=yaml.dump(rule),
                        name=rule["id"],
                        source=source,
                        status="DEPLOYED",
                        type="semgrep",
                        revision=1,
                        signature_id=rule["id"],
                    )
                )
                signatures.append(signature)

        # Push everything in one call once all files have been parsed.
        self.client.signature.add_update_many(source, "semgrep", signatures)

# def prepare_output_directory(self) -> str:
# tempdir = tempfile.mkdtemp()
# shutil.copytree(self.latest_updates_dir, tempdir, dirs_exist_ok=True)
# return tempdir


if __name__ == "__main__":
    # Script entry point: build the updater (accepting every file name via the
    # ".*" default pattern) and keep serving until the process is stopped.
    with AssemblylineServiceUpdater(default_pattern=".*") as updater:
        updater.serve_forever()
77 changes: 77 additions & 0 deletions semgrep/service_manifest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
name: Semgrep
version: $SERVICE_TAG
description: Service using Semgrep to analyze code for malicious activity.
enabled: true

accepts: code/*
rejects: empty
stage: CORE
category: Static Analysis
uses_tags: false
file_required: true
timeout: 90
is_external: false

config:
SEMGREP_RAM_LIMIT_MB: "400"
SEMGREP_RULE_TIMEOUT: "10"
SEMGREP_CLI_TIMEOUT: 60

# submission_params:
# - default: "auto"
# name: platform
# type: list
# value: "auto"
# list: ["auto", "linux"]

# -1000: safe
# 0 - 299: informative
# 300 - 699: suspicious
# 700 - 999: highly suspicious
# >= 1000: malicious

heuristics:
- description: Suspicious code pattern
filetype: "*"
heur_id: 1
name: Score
score: 300
max_score: 500
- description: Malicious code pattern
filetype: "*"
heur_id: 2
name: Score
score: 1000
max_score: 2000
- description: Informative code pattern
filetype: "*"
heur_id: 3
name: Score
score: 10
max_score: 50

docker_config:
image: ${REGISTRY}ghcr.io/kam193/assemblyline-service-semgrep:$SERVICE_TAG
cpu_cores: 1.0
ram_mb: 512
ram_mb_min: 256
allow_internet_access: false

update_config:
update_interval_seconds: 86400 # 1 day
generates_signatures: true
wait_for_update: true
sources:
- uri: https://gist.githubusercontent.com/kam193/474547be4a37bb7990caa4e26ee542e4/raw/a0d3783f38e156cfda5df63f426bfa8d36cc4027/example_rule.yaml
name: example_rules

dependencies:
updates:
container:
ram_mb: 512
ram_mb_min: 128
allow_internet_access: true
command: ["python", "-m", "service.updater"]
image: ${REGISTRY}ghcr.io/kam193/assemblyline-service-semgrep:$SERVICE_TAG
ports: ["5003"]
run_as_core: True

0 comments on commit 028bdeb

Please sign in to comment.