From 16fa76ae89dc6e9db5b9a5c0fe8e448b402060fc Mon Sep 17 00:00:00 2001 From: clearbluejar <3752074+clearbluejar@users.noreply.github.com> Date: Fri, 13 Dec 2024 23:45:36 +0000 Subject: [PATCH] Support gzf-export --- ghidrecomp/__init__.py | 2 +- ghidrecomp/decompile.py | 52 ++++++++++++++++++++++++++------------ ghidrecomp/parser.py | 6 +++-- ghidrecomp/utility.py | 14 ++++++++++- tests/test_gzf.py | 55 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 109 insertions(+), 20 deletions(-) create mode 100644 tests/test_gzf.py diff --git a/ghidrecomp/__init__.py b/ghidrecomp/__init__.py index 0d507b8..e59a45e 100644 --- a/ghidrecomp/__init__.py +++ b/ghidrecomp/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.5.4' +__version__ = '0.5.5' __author__ = 'clearbluejar' # Expose API diff --git a/ghidrecomp/decompile.py b/ghidrecomp/decompile.py index 24cf91c..1f6e2bd 100644 --- a/ghidrecomp/decompile.py +++ b/ghidrecomp/decompile.py @@ -8,9 +8,9 @@ import hashlib from pyhidra import HeadlessPyhidraLauncher, open_program -from .utility import set_pdb, setup_symbol_server, set_remote_pdbs, analyze_program, get_pdb, apply_gdt +from .utility import set_pdb, setup_symbol_server, set_remote_pdbs, analyze_program, get_pdb, apply_gdt, save_program_as_gzf from .callgraph import get_called, get_calling, CallGraph, gen_callgraph -from .bsim import gen_bsim_sigs_for_program,has_bsim +from .bsim import gen_bsim_sigs_for_program, has_bsim # needed for ghidra python vscode autocomplete if TYPE_CHECKING: @@ -23,6 +23,7 @@ def get_filename(func: 'ghidra.program.model.listing.Function'): return f'{func.getName()[:MAX_PATH_LEN]}-{func.entryPoint}' + def get_md5_file_digest(path: str) -> str: # https://stackoverflow.com/questions/22058048/hashing-a-file-in-python # BUF_SIZE is totally arbitrary, change for your app! 
@@ -41,6 +42,7 @@ def get_md5_file_digest(path: str) -> str: return f'{md5.hexdigest()}' + def gen_proj_bin_name_from_path(path: Path): """ Generate unique project name from binary for Ghidra Project @@ -48,10 +50,12 @@ def gen_proj_bin_name_from_path(path: Path): return '-'.join((path.name, get_md5_file_digest(path.absolute()))) + def get_bin_output_path(output_path: Path, bin_name: str): return Path(output_path) / 'bins' / bin_name + def setup_decompliers(program: "ghidra.program.model.listing.Program", thread_count: int = 2) -> dict: """ Setup decompliers to use during diff bins. Each one must be initialized with a program. @@ -120,8 +124,8 @@ def decompile_to_single_file(path: Path, monitor = ConsoleTaskMonitor().DUMMY try: - # Ghidra CppExporter before 10.3.3 and later - decompiler = CppExporter(None,create_header, create_file, emit_types, exclude_tags, tags) + # Ghidra CppExporter before 10.3.3 and later + decompiler = CppExporter(None, create_header, create_file, emit_types, exclude_tags, tags) except TypeError: # Ghidra CppExporter before 10.3.3 decompiler = CppExporter(create_header, create_file, emit_types, exclude_tags, tags) @@ -129,28 +133,35 @@ def decompile_to_single_file(path: Path, decompiler.export(c_file, prog, prog.getMemory(), monitor) - def decompile(args: Namespace): print(f'Starting decompliations: {args}') bin_path = Path(args.bin) - bin_proj_name = gen_proj_bin_name_from_path(bin_path) + bin_proj_name = gen_proj_bin_name_from_path(bin_path) thread_count = args.thread_count output_path = Path(args.output_path) - bin_output_path = get_bin_output_path(output_path, bin_proj_name) + bin_output_path = get_bin_output_path(output_path, bin_proj_name) decomp_path = bin_output_path / 'decomps' output_path.mkdir(exist_ok=True, parents=True) bin_output_path.mkdir(exist_ok=True, parents=True) decomp_path.mkdir(exist_ok=True, parents=True) - if args.project_path == 'ghidra_projects': project_location = output_path / args.project_path else: 
project_location = Path(args.project_path) + gzf_path = None + if args.gzf: + if args.gzf_path == 'gzfs': + gzf_path = output_path / args.gzf_path + else: + gzf_path = Path(args.gzf_path) + + gzf_path.mkdir(exist_ok=True, parents=True) + if args.symbols_path == 'symbols': symbols_path = output_path / args.symbols_path else: @@ -161,7 +172,6 @@ def decompile(args: Namespace): else: bsim_sig_path = output_path / Path(args.bsim_sig_path) - # turn on verbose launcher = HeadlessPyhidraLauncher(True) @@ -204,7 +214,18 @@ def decompile(args: Namespace): print(f'Using file gdts: {gdt_names}') # analyze program if we haven't yet - analyze_program(program, verbose=args.va, force_analysis=args.fa) + analyze_program(program, verbose=args.va, force_analysis=args.fa, gzf_path=gzf_path) + + # Save copy of program in gzf after analysis + if args.gzf: + from ghidra.base.project import GhidraProject + try: + project = GhidraProject.openProject(Path(project_location / bin_proj_name), bin_proj_name, True) + program = project.openProgram("/", bin_path.name, True) + save_program_as_gzf(program, gzf_path / bin_proj_name, project) + finally: + project.close(program) + project.close() # decompile and callgraph all the things with open_program(bin_path, project_location=project_location, project_name=bin_proj_name, analyze=False) as flat_api: @@ -291,7 +312,7 @@ def decompile(args: Namespace): max_display_depth = None if args.max_display_depth is not None: max_display_depth = int(args.max_display_depth) - + with concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) as executor: futures = (executor.submit(gen_callgraph, func, max_display_depth, direction, args.max_time_cg_gen, get_filename(func)) for direction in directions for func in all_funcs if args.skip_cache or get_filename(func) not in callgraphs_completed and re.search(args.callgraph_filter, func.name) is not None) @@ -315,20 +336,19 @@ def decompile(args: Namespace): print(f'Wrote {completed} callgraphs for {program.name} 
to {callgraph_path} in {time() - start}') print(f'{len(all_funcs) - completed} callgraphs already existed.') - # BSim gensig = None manager = None if args.bsim: - + if has_bsim(): start = time() print(f'Generating BSim sigs for {len(all_funcs)} functions for {program.name}') - sig_name, func_count, cat_count = gen_bsim_sigs_for_program(program,bsim_sig_path,args.bsim_template,args.bsim_cat,all_funcs) + sig_name, func_count, cat_count = gen_bsim_sigs_for_program( + program, bsim_sig_path, args.bsim_template, args.bsim_cat, all_funcs) print(f'Generated BSim sigs for {func_count} functions in {time() - start}') print(f'Sigs are in {bsim_sig_path / sig_name}') else: print('WARN: Skipping BSim. BSim not present') - - + return (all_funcs, decompilations, bin_output_path, str(program.compiler), str(program.languageID), callgraphs) diff --git a/ghidrecomp/parser.py b/ghidrecomp/parser.py index ebfd4bf..e7299d9 100644 --- a/ghidrecomp/parser.py +++ b/ghidrecomp/parser.py @@ -8,6 +8,7 @@ THREAD_COUNT = multiprocessing.cpu_count() + def get_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description='ghidrecomp - A Command Line Ghidra Decompiler', @@ -17,6 +18,8 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument('--cppexport', action='store_true', help='Use Ghidras CppExporter to decompile to single file') parser.add_argument('--filter', dest='filters', action='append', help='Regex match for function name') parser.add_argument('--project-path', help='Path to base ghidra projects ', default='ghidra_projects') + parser.add_argument('--gzf', help='Export gzf of analyzed project', action='store_true') + parser.add_argument('--gzf-path', help='Path to store gzf of analyzed project', default='gzfs') parser.add_argument('--gdt', help='Additional GDT to apply', nargs='?', action='append') parser.add_argument('-o', '--output-path', help='Location for all decompilations', default='ghidrecomps') parser.add_argument("-v", "--version", 
action="version", version=__version__) @@ -40,6 +43,5 @@ def get_parser() -> argparse.ArgumentParser: add_cg_args_to_parser(parser) add_bsim_args_to_parser(parser) - - return parser \ No newline at end of file + return parser diff --git a/ghidrecomp/utility.py b/ghidrecomp/utility.py index 591e36f..3024303 100644 --- a/ghidrecomp/utility.py +++ b/ghidrecomp/utility.py @@ -8,7 +8,7 @@ from ghidra_builtins import * # noqa: F403 -def analyze_program(program, verbose: bool = False, force_analysis: bool = False, save: bool = False): +def analyze_program(program: "ghidra.program.model.listing.Program", verbose: bool = False, force_analysis: bool = False, save: bool = False, gzf_path: Path = None): """ Modified pyhidra.core._analyze_program """ @@ -47,6 +47,18 @@ def analyze_program(program, verbose: bool = False, force_analysis: bool = False print(f'{program} already analyzed... skipping')
+def save_program_as_gzf(program: "ghidra.program.model.listing.Program", gzf_path: Path, project):
+    """
+    Pack `program` into `<gzf_path>.gzf` via `project.saveAsPackedFile`.
+
+    `gzf_path` is given without the `.gzf` suffix; `project` is the GhidraProject used to write it.
+    """
+    from java.io import File  # local import; presumably needs the started JVM - TODO confirm
+    print(f'Saving gzf archive to {gzf_path}.gzf')
+    # NOTE(review): the trailing True is presumably the overwrite flag - confirm against Ghidra API
+    project.saveAsPackedFile(program, File(f'{gzf_path}.gzf'), True)
+
+
 def setup_symbol_server(symbols_path: Union[str, Path], level=1, server_urls=None) -> None:
     """
     setup symbols to allow Ghidra to download as needed
diff --git a/tests/test_gzf.py b/tests/test_gzf.py
new file mode 100644
index 0000000..b217f43
--- /dev/null
+++ b/tests/test_gzf.py
@@ -0,0 +1,55 @@
+import pytest
+from pathlib import Path
+
+import pyhidra
+from ghidrecomp import decompile, get_parser
+from ghidrecomp.decompile import get_bin_output_path, gen_proj_bin_name_from_path
+from ghidrecomp.bsim import has_bsim, add_bsim_args_to_parser, add_categories_to_prog
+from pyhidra import HeadlessPyhidraLauncher
+from pathlib import Path
+
+
+def test_gzf_created(shared_datadir: Path):
+    """With --gzf alone, the archive lands in <output_path>/gzfs/<project name>.gzf."""
+    parser = get_parser()
+
+    bin_path = shared_datadir / 'ls_aarch64'
+
+    args = parser.parse_args([f"{bin_path.absolute()}", "--filter", "ctype", "--skip-cache", "--gzf"])
+
+    bin_proj_name = gen_proj_bin_name_from_path(bin_path)
+    expected_output_path = get_bin_output_path(args.output_path, bin_proj_name)
+
+    gzf_output_path = Path(args.output_path) / 'gzfs' / f"{bin_proj_name}.gzf"
+
+    all_funcs, decompilations, output_path, compiler, lang_id, callgraphs = decompile(args)
+
+    assert len(all_funcs) == 8
+    assert len(decompilations) == 8
+    assert len(callgraphs) == 0
+    assert expected_output_path == output_path
+    assert gzf_output_path.exists()
+
+
+def test_gzf_created_with_path(shared_datadir: Path):
+    """With --gzf-path, the archive lands in the custom directory as <project name>.gzf."""
+    parser = get_parser()
+
+    bin_path = shared_datadir / 'ls_aarch64'
+    gzf_custom_path = shared_datadir / "custom_gzf"
+
+    args = parser.parse_args([f"{bin_path.absolute()}", "--filter", "ctype", "--skip-cache",
+                              "--gzf", "--gzf-path", str(gzf_custom_path)])
+
+    bin_proj_name = gen_proj_bin_name_from_path(bin_path)
+    expected_output_path = get_bin_output_path(args.output_path, bin_proj_name)
+    gzf_output_path = gzf_custom_path / f"{bin_proj_name}.gzf"
+
+    all_funcs, decompilations, output_path, compiler, lang_id, callgraphs = decompile(args)
+
+    assert len(all_funcs) == 8
+    assert len(decompilations) == 8
+    assert len(callgraphs) == 0
+    assert expected_output_path == output_path
+    # decompile() creates the gzf directory before exporting, so only the file proves the export
+    assert gzf_output_path.exists()