Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gzf export3 #34

Merged
merged 3 commits into from
Dec 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ghidrecomp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.5.4'
__version__ = '0.5.5'
__author__ = 'clearbluejar'

# Expose API
Expand Down
52 changes: 36 additions & 16 deletions ghidrecomp/decompile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
import hashlib
from pyhidra import HeadlessPyhidraLauncher, open_program

from .utility import set_pdb, setup_symbol_server, set_remote_pdbs, analyze_program, get_pdb, apply_gdt
from .utility import set_pdb, setup_symbol_server, set_remote_pdbs, analyze_program, get_pdb, apply_gdt, save_program_as_gzf
from .callgraph import get_called, get_calling, CallGraph, gen_callgraph
from .bsim import gen_bsim_sigs_for_program,has_bsim
from .bsim import gen_bsim_sigs_for_program, has_bsim

# needed for ghidra python vscode autocomplete
if TYPE_CHECKING:
Expand All @@ -23,6 +23,7 @@
def get_filename(func: 'ghidra.program.model.listing.Function'):
    """
    Build the output file stem for a function.

    The stem is the function name, cut to at most MAX_PATH_LEN characters,
    followed by '-' and the function's entry point.
    """
    short_name = func.getName()[:MAX_PATH_LEN]
    entry = func.entryPoint
    return f'{short_name}-{entry}'


def get_md5_file_digest(path: str) -> str:
# https://stackoverflow.com/questions/22058048/hashing-a-file-in-python
# BUF_SIZE is totally arbitrary, change for your app!
Expand All @@ -41,17 +42,20 @@ def get_md5_file_digest(path: str) -> str:

return f'{md5.hexdigest()}'


def gen_proj_bin_name_from_path(path: Path):
    """
    Generate unique project name from binary for Ghidra Project

    The result is ``<file name>-<digest>``, where the digest is whatever
    ``get_md5_file_digest`` returns for the absolute form of ``path``.
    """
    digest = get_md5_file_digest(path.absolute())
    return f'{path.name}-{digest}'


def get_bin_output_path(output_path: Path, bin_name: str):
    """
    Return the per-binary output directory: ``<output_path>/bins/<bin_name>``.

    ``output_path`` is wrapped in ``Path`` first, so a plain string works too.
    """
    return Path(output_path).joinpath('bins', bin_name)


def setup_decompliers(program: "ghidra.program.model.listing.Program", thread_count: int = 2) -> dict:
"""
Setup decompliers to use during diff bins. Each one must be initialized with a program.
Expand Down Expand Up @@ -120,37 +124,44 @@ def decompile_to_single_file(path: Path,
monitor = ConsoleTaskMonitor().DUMMY

try:
# Ghidra CppExporter before 10.3.3 and later
decompiler = CppExporter(None,create_header, create_file, emit_types, exclude_tags, tags)
# Ghidra CppExporter 10.3.3 and later
decompiler = CppExporter(None, create_header, create_file, emit_types, exclude_tags, tags)
except TypeError:
# Ghidra CppExporter before 10.3.3
decompiler = CppExporter(create_header, create_file, emit_types, exclude_tags, tags)

decompiler.export(c_file, prog, prog.getMemory(), monitor)



def decompile(args: Namespace):

print(f'Starting decompliations: {args}')

bin_path = Path(args.bin)
bin_proj_name = gen_proj_bin_name_from_path(bin_path)
bin_proj_name = gen_proj_bin_name_from_path(bin_path)
thread_count = args.thread_count

output_path = Path(args.output_path)
bin_output_path = get_bin_output_path(output_path, bin_proj_name)
bin_output_path = get_bin_output_path(output_path, bin_proj_name)
decomp_path = bin_output_path / 'decomps'
output_path.mkdir(exist_ok=True, parents=True)
bin_output_path.mkdir(exist_ok=True, parents=True)
decomp_path.mkdir(exist_ok=True, parents=True)


if args.project_path == 'ghidra_projects':
project_location = output_path / args.project_path
else:
project_location = Path(args.project_path)

gzf_path = None
if args.gzf:
if args.gzf_path == 'gzfs':
gzf_path = output_path / args.gzf_path
else:
gzf_path = Path(args.gzf_path)

gzf_path.mkdir(exist_ok=True, parents=True)

if args.symbols_path == 'symbols':
symbols_path = output_path / args.symbols_path
else:
Expand All @@ -161,7 +172,6 @@ def decompile(args: Namespace):
else:
bsim_sig_path = output_path / Path(args.bsim_sig_path)


# turn on verbose
launcher = HeadlessPyhidraLauncher(True)

Expand Down Expand Up @@ -204,7 +214,18 @@ def decompile(args: Namespace):
print(f'Using file gdts: {gdt_names}')

# analyze program if we haven't yet
analyze_program(program, verbose=args.va, force_analysis=args.fa)
analyze_program(program, verbose=args.va, force_analysis=args.fa, gzf_path=gzf_path)

# Save copy of program in gzf after analysis
if args.gzf:
from ghidra.base.project import GhidraProject
try:
project = GhidraProject.openProject(Path(project_location / bin_proj_name), bin_proj_name, True)
program = project.openProgram("/", bin_path.name, True)
save_program_as_gzf(program, gzf_path / bin_proj_name, project)
finally:
project.close(program)
project.close()

# decompile and callgraph all the things
with open_program(bin_path, project_location=project_location, project_name=bin_proj_name, analyze=False) as flat_api:
Expand Down Expand Up @@ -291,7 +312,7 @@ def decompile(args: Namespace):
max_display_depth = None
if args.max_display_depth is not None:
max_display_depth = int(args.max_display_depth)

with concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) as executor:
futures = (executor.submit(gen_callgraph, func, max_display_depth, direction, args.max_time_cg_gen, get_filename(func))
for direction in directions for func in all_funcs if args.skip_cache or get_filename(func) not in callgraphs_completed and re.search(args.callgraph_filter, func.name) is not None)
Expand All @@ -315,20 +336,19 @@ def decompile(args: Namespace):
print(f'Wrote {completed} callgraphs for {program.name} to {callgraph_path} in {time() - start}')
print(f'{len(all_funcs) - completed} callgraphs already existed.')


# BSim
gensig = None
manager = None
if args.bsim:

if has_bsim():
start = time()
print(f'Generating BSim sigs for {len(all_funcs)} functions for {program.name}')
sig_name, func_count, cat_count = gen_bsim_sigs_for_program(program,bsim_sig_path,args.bsim_template,args.bsim_cat,all_funcs)
sig_name, func_count, cat_count = gen_bsim_sigs_for_program(
program, bsim_sig_path, args.bsim_template, args.bsim_cat, all_funcs)
print(f'Generated BSim sigs for {func_count} functions in {time() - start}')
print(f'Sigs are in {bsim_sig_path / sig_name}')
else:
print('WARN: Skipping BSim. BSim not present')



return (all_funcs, decompilations, bin_output_path, str(program.compiler), str(program.languageID), callgraphs)
6 changes: 4 additions & 2 deletions ghidrecomp/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

THREAD_COUNT = multiprocessing.cpu_count()


def get_parser() -> argparse.ArgumentParser:

parser = argparse.ArgumentParser(description='ghidrecomp - A Command Line Ghidra Decompiler',
Expand All @@ -17,6 +18,8 @@ def get_parser() -> argparse.ArgumentParser:
parser.add_argument('--cppexport', action='store_true', help='Use Ghidras CppExporter to decompile to single file')
parser.add_argument('--filter', dest='filters', action='append', help='Regex match for function name')
parser.add_argument('--project-path', help='Path to base ghidra projects ', default='ghidra_projects')
parser.add_argument('--gzf', help='Export gzf of analyzed project', action='store_true')
parser.add_argument('--gzf-path', help='Path to store gzf of analyzed project', default='gzfs')
parser.add_argument('--gdt', help='Additional GDT to apply', nargs='?', action='append')
parser.add_argument('-o', '--output-path', help='Location for all decompilations', default='ghidrecomps')
parser.add_argument("-v", "--version", action="version", version=__version__)
Expand All @@ -40,6 +43,5 @@ def get_parser() -> argparse.ArgumentParser:
add_cg_args_to_parser(parser)

add_bsim_args_to_parser(parser)


return parser
return parser
14 changes: 13 additions & 1 deletion ghidrecomp/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ghidra_builtins import * # noqa: F403


def analyze_program(program, verbose: bool = False, force_analysis: bool = False, save: bool = False):
def analyze_program(program: "ghidra.program.model.listing.Program", verbose: bool = False, force_analysis: bool = False, save: bool = False, gzf_path: Path = None):
"""
Modified pyhidra.core._analyze_program
"""
Expand Down Expand Up @@ -47,6 +47,18 @@ def analyze_program(program, verbose: bool = False, force_analysis: bool = False
print(f'{program} already analyzed... skipping')


def save_program_as_gzf(program: "ghidra.program.model.listing.Program", gzf_path: Path, project):
    """
    Save a program as a packed Ghidra file (.gzf)

    Args:
        program: the program to write into the archive.
        gzf_path: destination path WITHOUT extension; ``.gzf`` is appended.
        project: object exposing ``saveAsPackedFile(program, file, flag)``.
            The caller in decompile.py passes an opened GhidraProject.
    """
    # Imported inside the function, as the original did.
    # NOTE(review): presumably because java.* is only importable once the JVM is up - confirm.
    from java.io import File

    # Build the target once so the log line and the written file cannot drift apart.
    gzf_file = f'{gzf_path}.gzf'
    print(f'Saving gzf archive to {gzf_file}')

    # NOTE(review): the trailing True is presumably the overwrite flag - confirm against the Ghidra API.
    project.saveAsPackedFile(program, File(gzf_file), True)


def setup_symbol_server(symbols_path: Union[str, Path], level=1, server_urls=None) -> None:
"""
setup symbols to allow Ghidra to download as needed
Expand Down
55 changes: 55 additions & 0 deletions tests/test_gzf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest
from pathlib import Path

import pyhidra
from ghidrecomp import decompile, get_parser
from ghidrecomp.decompile import get_bin_output_path, gen_proj_bin_name_from_path
from ghidrecomp.bsim import has_bsim, add_bsim_args_to_parser, add_categories_to_prog
from pyhidra import HeadlessPyhidraLauncher
from pathlib import Path


def test_gzf_created(shared_datadir: Path):
    """With only --gzf, the archive is expected at <output_path>/gzfs/<project name>.gzf."""

    target_bin = shared_datadir / 'ls_aarch64'

    cli_args = [f"{target_bin.absolute()}", "--filter", "ctype", "--skip-cache", "--gzf"]
    args = get_parser().parse_args(cli_args)

    proj_name = gen_proj_bin_name_from_path(target_bin)
    expected_output_path = get_bin_output_path(args.output_path, proj_name)

    # 'gzfs' is the parser default for --gzf-path, placed under the output path
    expected_gzf = Path(args.output_path) / 'gzfs' / f"{proj_name}.gzf"

    all_funcs, decompilations, output_path, _compiler, _lang_id, callgraphs = decompile(args)

    assert len(all_funcs) == 8
    assert len(decompilations) == 8
    assert len(callgraphs) == 0
    assert expected_output_path == output_path
    assert expected_gzf.exists()


def test_gzf_created_with_path(shared_datadir: Path):
    """With --gzf-path, the archive must be written inside the custom directory."""

    parser = get_parser()

    bin_path = shared_datadir / 'ls_aarch64'
    gzf_custom_path = shared_datadir / "custom_gzf"

    args = parser.parse_args([f"{bin_path.absolute()}", "--filter", "ctype", "--skip-cache",
                              "--gzf", "--gzf-path", str(gzf_custom_path)])

    bin_proj_name = gen_proj_bin_name_from_path(bin_path)
    expected_output_path = get_bin_output_path(args.output_path, bin_proj_name)

    gzf_output_path = gzf_custom_path / f"{bin_proj_name}.gzf"

    all_funcs, decompilations, output_path, compiler, lang_id, callgraphs = decompile(args)

    assert len(all_funcs) == 8
    assert len(decompilations) == 8
    assert len(callgraphs) == 0
    assert expected_output_path == output_path
    # Check the archive itself: decompile() creates the directory before any export,
    # so a directory-only check would pass even if no .gzf were written.
    assert gzf_custom_path.is_dir()
    assert gzf_output_path.exists()
Loading