Skip to content

Commit

Permalink
feat: add support for 7z files (#35)
Browse files Browse the repository at this point in the history
* feat: add support for 7z files

* feat: update actions versions

* refactor: remove unused imports

* feat(ci): bump actions versions

* fix: PR comments

* fix(ci): install picklescan with 7z dependency in tests
  • Loading branch information
McPatate authored Feb 12, 2025
1 parent 78debf9 commit 25d753f
Show file tree
Hide file tree
Showing 8 changed files with 93 additions and 24 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,25 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
uses: github/codeql-action/init@v3
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.

# For details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
# queries: security-extended,security-and-quality


# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v2
uses: github/codeql-action/autobuild@v3

# ℹ️ Command-line programs to run using the OS shell.
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
Expand All @@ -58,4 +58,4 @@ jobs:
# ./location_of_script_within_repo/buildscript.sh

- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
uses: github/codeql-action/analyze@v3
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ jobs:
publish:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.9"
- name: Install dependencies
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,24 @@ jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: "3.9"
- name: Install packages
run: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
python -m pip install -e .
python -m pip install -e '.[7z]'
- name: Check code format
run: black --check src tests
- name: Lint with flake8
run: python -m flake8 . --count --show-source --statistics
- name: Test with pytest
run: python -m pytest tests --cov=picklescan --doctest-modules --junitxml=junit/test-results.xml --cov-report=xml --cov-report=html
- name: Archive test results
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: test
path: |
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ requests==2.31.0
aiohttp==3.9.1
black==22.8.0
numpy>1.24.0
py7zr==0.22.0
5 changes: 4 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = picklescan
version = 0.0.19
version = 0.0.20
author = Matthieu Maitre
author_email = [email protected]
description = Security scanner detecting Python Pickle files performing suspicious actions
Expand All @@ -21,6 +21,9 @@ packages = find:
python_requires = >=3.9
install_requires =

[options.extras_require]
7z=py7zr

[options.packages.find]
where = src

Expand Down
59 changes: 52 additions & 7 deletions src/picklescan/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
import pickletools
from tarfile import TarError
from tempfile import TemporaryDirectory
from typing import IO, List, Optional, Set, Tuple
import urllib.parse
import zipfile
Expand Down Expand Up @@ -151,7 +152,23 @@ def __str__(self) -> str:
_numpy_file_extensions = {".npy"} # Note: .npz is handled as zip files
_pytorch_file_extensions = {".bin", ".pt", ".pth", ".ckpt"}
_pickle_file_extensions = {".pkl", ".pickle", ".joblib", ".dat", ".data"}
_zip_file_extensions = {".zip", ".npz"}
_zip_file_extensions = {".zip", ".npz", ".7z"}


def _is_7z_file(f: IO[bytes]) -> bool:
read_bytes = []
start = f.tell()

byte = f.read(1)
while byte != b"":
read_bytes.append(byte)
if len(read_bytes) == 6:
break
byte = f.read(1)
f.seek(start)

local_header_magic_number = [b"7", b"z", b"\xbc", b"\xaf", b"\x27", b"\x1c"]
return read_bytes == local_header_magic_number


def _http_get(url) -> bytes:
Expand Down Expand Up @@ -307,12 +324,37 @@ def scan_pickle_bytes(data: IO[bytes], file_id, multiple_pickles=True) -> ScanRe
return _build_scan_result_from_raw_globals(raw_globals, file_id)


# XXX: there appears to be no way to get the byte stream for a given file within
# a 7z archive, which forces us to extract to disk before scanning.
def scan_7z_bytes(data: IO[bytes], file_id) -> ScanResult:
    """Scan the pickle files contained in a 7z archive.

    Archive members whose name ends with one of ``_pickle_file_extensions``
    are extracted into a temporary directory and each one is scanned with
    ``scan_file_path``; the individual results are merged into one
    ``ScanResult``.

    Args:
        data: Binary stream positioned at the start of the 7z archive.
        file_id: Identifier of the archive, used in log messages only.

    Returns:
        The merged scan result of every scanned member (empty when the
        archive contains no matching member).

    Raises:
        Exception: If the optional ``py7zr`` dependency is not installed.
    """
    try:
        import py7zr
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise Exception(
            "py7zr is required to scan 7z archives, install picklescan using: 'pip install picklescan[7z]'"
        ) from err
    result = ScanResult([])

    with py7zr.SevenZipFile(data, mode="r") as archive:
        file_names = archive.getnames()
        targets = [f for f in file_names if f.endswith(tuple(_pickle_file_extensions))]
        _log.debug("Files in 7z archive %s: %s", file_id, targets)
        with TemporaryDirectory() as tmpdir:
            archive.extract(path=tmpdir, targets=targets)
            # Resolve symlinks so the containment check below compares real paths.
            extraction_root = os.path.realpath(tmpdir)
            for file_name in targets:
                file_path = os.path.join(tmpdir, file_name)
                # Member names come from an untrusted archive: an absolute name,
                # a ".." component or an extracted symlink must never make us
                # read a file that lives outside the extraction directory.
                resolved_path = os.path.realpath(file_path)
                if not resolved_path.startswith(extraction_root + os.sep):
                    _log.warning(
                        "Skipping file %s in 7z archive %s: path escapes the extraction directory",
                        file_name,
                        file_id,
                    )
                    continue
                _log.debug("Scanning file %s in 7z archive %s", file_name, file_id)
                if os.path.isfile(file_path):
                    # NOTE(review): results are presumably keyed by the temporary
                    # path rather than by file_id — confirm this is intended.
                    result.merge(scan_file_path(file_path))

    return result


def scan_zip_bytes(data: IO[bytes], file_id) -> ScanResult:
result = ScanResult([])

with zipfile.ZipFile(data, "r") as zip:
file_names = zip.namelist()
_log.debug("Files in archive %s: %s", file_id, file_names)
_log.debug("Files in zip archive %s: %s", file_id, file_names)
for file_name in file_names:
file_ext = os.path.splitext(file_name)[1]
if file_ext in _pickle_file_extensions:
Expand Down Expand Up @@ -361,6 +403,8 @@ def scan_pytorch(data: IO[bytes], file_id) -> ScanResult:
# new pytorch format
if _is_zipfile(data):
return scan_zip_bytes(data, file_id)
elif _is_7z_file(data):
return scan_7z_bytes(data, file_id)
# old pytorch format
else:
scan_result = ScanResult([])
Expand Down Expand Up @@ -395,11 +439,12 @@ def scan_bytes(data: IO[bytes], file_id, file_ext: Optional[str] = None) -> Scan
else:
is_zip = zipfile.is_zipfile(data)
data.seek(0)
return (
scan_zip_bytes(data, file_id)
if is_zip
else scan_pickle_bytes(data, file_id)
)
if is_zip:
return scan_zip_bytes(data, file_id)
elif _is_7z_file(data):
return scan_7z_bytes(data, file_id)
else:
return scan_pickle_bytes(data, file_id)


def scan_huggingface_model(repo_id):
Expand Down
Binary file added tests/data/malicious1.7z
Binary file not shown.
28 changes: 24 additions & 4 deletions tests/test_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import importlib
import io
import os
import runpy
import pathlib
import pickle
import py7zr
import pytest
import requests
import runpy
import socket
import subprocess
import sys
Expand Down Expand Up @@ -177,6 +179,18 @@ def initialize_data_file(path, data):
file.write(data)


def initialize_7z_file(archive_path, file_name):
    """Create a 7z test archive holding a pickled ``Malicious1`` object.

    Nothing is done when ``archive_path`` already exists. Otherwise the
    payload is pickled (protocol 4) into a temporary directory and stored in
    the new archive under ``file_name``.

    Args:
        archive_path: Path of the 7z archive to create.
        file_name: Name given to the pickle inside the archive.
    """
    if os.path.exists(archive_path):
        return

    # Local import keeps this helper self-contained.
    import tempfile

    # Stage the pickle outside the fixture directory so that no existing
    # fixture is overwritten or deleted, and so that the intermediate file is
    # cleaned up even if writing the archive fails.
    with tempfile.TemporaryDirectory() as tmpdir:
        pickle_path = os.path.join(tmpdir, "malicious1.pkl")
        with open(pickle_path, "wb") as f:
            pickle.dump(Malicious1(), f, protocol=4)

        with py7zr.SevenZipFile(archive_path, "w") as archive:
            archive.write(pickle_path, file_name)


def initialize_zip_file(path, file_name, data):
if not os.path.exists(path):
with zipfile.ZipFile(path, "w") as zip:
Expand Down Expand Up @@ -399,6 +413,11 @@ def initialize_pickle_files():
initialize_pickle_file(f"{_root_path}/data/malicious15a.pkl", Malicious15(), 2)
initialize_pickle_file(f"{_root_path}/data/malicious15b.pkl", Malicious15(), 4)

initialize_7z_file(
f"{_root_path}/data/malicious1.7z",
"data.pkl",
)

initialize_zip_file(
f"{_root_path}/data/malicious1.zip",
"data.pkl",
Expand Down Expand Up @@ -732,10 +751,11 @@ def test_scan_directory_path():
Global("bdb", "Bdb", SafetyLevel.Dangerous),
Global("bdb", "Bdb.run", SafetyLevel.Dangerous),
Global("builtins", "exec", SafetyLevel.Dangerous),
Global("builtins", "eval", SafetyLevel.Dangerous),
],
scanned_files=31,
issues_count=31,
infected_files=26,
scanned_files=32,
issues_count=32,
infected_files=27,
scan_err=True,
)
compare_scan_results(scan_directory_path(f"{_root_path}/data/"), sr)
Expand Down

0 comments on commit 25d753f

Please sign in to comment.