Skip to content

Commit

Permalink
Support gzf-export
Browse files Browse the repository at this point in the history
  • Loading branch information
clearbluejar committed Dec 14, 2024
1 parent df57551 commit 16fa76a
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 20 deletions.
2 changes: 1 addition & 1 deletion ghidrecomp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.5.4'
__version__ = '0.5.5'
__author__ = 'clearbluejar'

# Expose API
Expand Down
52 changes: 36 additions & 16 deletions ghidrecomp/decompile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
import hashlib
from pyhidra import HeadlessPyhidraLauncher, open_program

from .utility import set_pdb, setup_symbol_server, set_remote_pdbs, analyze_program, get_pdb, apply_gdt
from .utility import set_pdb, setup_symbol_server, set_remote_pdbs, analyze_program, get_pdb, apply_gdt, save_program_as_gzf
from .callgraph import get_called, get_calling, CallGraph, gen_callgraph
from .bsim import gen_bsim_sigs_for_program,has_bsim
from .bsim import gen_bsim_sigs_for_program, has_bsim

# needed for ghidra python vscode autocomplete
if TYPE_CHECKING:
Expand All @@ -23,6 +23,7 @@
def get_filename(func: 'ghidra.program.model.listing.Function'):
    """
    Build the file name stem used for a function's output.

    The stem is the function name cut to at most MAX_PATH_LEN characters,
    followed by a hyphen and the function's entry point.
    """
    name_prefix = func.getName()[:MAX_PATH_LEN]
    entry = func.entryPoint
    return f'{name_prefix}-{entry}'


def get_md5_file_digest(path: str) -> str:
# https://stackoverflow.com/questions/22058048/hashing-a-file-in-python
# BUF_SIZE is totally arbitrary, change for your app!
Expand All @@ -41,17 +42,20 @@ def get_md5_file_digest(path: str) -> str:

return f'{md5.hexdigest()}'


def gen_proj_bin_name_from_path(path: Path):
    """
    Generate unique project name from binary for Ghidra Project

    The result is the binary's file name and the MD5 hex digest of its
    contents, separated by a hyphen.
    """
    file_name = path.name
    digest = get_md5_file_digest(path.absolute())
    return f'{file_name}-{digest}'


def get_bin_output_path(output_path: Path, bin_name: str):
    """
    Return the per-binary output directory: <output_path>/bins/<bin_name>.

    output_path may be a Path or a plain string; it is converted to a Path.
    """
    return Path(output_path).joinpath('bins', bin_name)


def setup_decompliers(program: "ghidra.program.model.listing.Program", thread_count: int = 2) -> dict:
"""
Setup decompliers to use during diff bins. Each one must be initialized with a program.
Expand Down Expand Up @@ -120,37 +124,44 @@ def decompile_to_single_file(path: Path,
monitor = ConsoleTaskMonitor().DUMMY

try:
# Ghidra CppExporter before 10.3.3 and later
decompiler = CppExporter(None,create_header, create_file, emit_types, exclude_tags, tags)
# Ghidra CppExporter before 10.3.3 and later
decompiler = CppExporter(None, create_header, create_file, emit_types, exclude_tags, tags)
except TypeError:
# Ghidra CppExporter before 10.3.3
decompiler = CppExporter(create_header, create_file, emit_types, exclude_tags, tags)

decompiler.export(c_file, prog, prog.getMemory(), monitor)



def decompile(args: Namespace):

print(f'Starting decompliations: {args}')

bin_path = Path(args.bin)
bin_proj_name = gen_proj_bin_name_from_path(bin_path)
bin_proj_name = gen_proj_bin_name_from_path(bin_path)
thread_count = args.thread_count

output_path = Path(args.output_path)
bin_output_path = get_bin_output_path(output_path, bin_proj_name)
bin_output_path = get_bin_output_path(output_path, bin_proj_name)
decomp_path = bin_output_path / 'decomps'
output_path.mkdir(exist_ok=True, parents=True)
bin_output_path.mkdir(exist_ok=True, parents=True)
decomp_path.mkdir(exist_ok=True, parents=True)


if args.project_path == 'ghidra_projects':
project_location = output_path / args.project_path
else:
project_location = Path(args.project_path)

gzf_path = None
if args.gzf:
if args.gzf_path == 'gzfs':
gzf_path = output_path / args.gzf_path
else:
gzf_path = Path(args.gzf_path)

gzf_path.mkdir(exist_ok=True, parents=True)

if args.symbols_path == 'symbols':
symbols_path = output_path / args.symbols_path
else:
Expand All @@ -161,7 +172,6 @@ def decompile(args: Namespace):
else:
bsim_sig_path = output_path / Path(args.bsim_sig_path)


# turn on verbose
launcher = HeadlessPyhidraLauncher(True)

Expand Down Expand Up @@ -204,7 +214,18 @@ def decompile(args: Namespace):
print(f'Using file gdts: {gdt_names}')

# analyze program if we haven't yet
analyze_program(program, verbose=args.va, force_analysis=args.fa)
analyze_program(program, verbose=args.va, force_analysis=args.fa, gzf_path=gzf_path)

# Save copy of program in gzf after analysis
if args.gzf:
from ghidra.base.project import GhidraProject
try:
project = GhidraProject.openProject(Path(project_location / bin_proj_name), bin_proj_name, True)
program = project.openProgram("/", bin_path.name, True)
save_program_as_gzf(program, gzf_path / bin_proj_name, project)
finally:
project.close(program)
project.close()

# decompile and callgraph all the things
with open_program(bin_path, project_location=project_location, project_name=bin_proj_name, analyze=False) as flat_api:
Expand Down Expand Up @@ -291,7 +312,7 @@ def decompile(args: Namespace):
max_display_depth = None
if args.max_display_depth is not None:
max_display_depth = int(args.max_display_depth)

with concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) as executor:
futures = (executor.submit(gen_callgraph, func, max_display_depth, direction, args.max_time_cg_gen, get_filename(func))
for direction in directions for func in all_funcs if args.skip_cache or get_filename(func) not in callgraphs_completed and re.search(args.callgraph_filter, func.name) is not None)
Expand All @@ -315,20 +336,19 @@ def decompile(args: Namespace):
print(f'Wrote {completed} callgraphs for {program.name} to {callgraph_path} in {time() - start}')
print(f'{len(all_funcs) - completed} callgraphs already existed.')


# BSim
gensig = None
manager = None
if args.bsim:

if has_bsim():
start = time()
print(f'Generating BSim sigs for {len(all_funcs)} functions for {program.name}')
sig_name, func_count, cat_count = gen_bsim_sigs_for_program(program,bsim_sig_path,args.bsim_template,args.bsim_cat,all_funcs)
sig_name, func_count, cat_count = gen_bsim_sigs_for_program(
program, bsim_sig_path, args.bsim_template, args.bsim_cat, all_funcs)
print(f'Generated BSim sigs for {func_count} functions in {time() - start}')
print(f'Sigs are in {bsim_sig_path / sig_name}')
else:
print('WARN: Skipping BSim. BSim not present')



return (all_funcs, decompilations, bin_output_path, str(program.compiler), str(program.languageID), callgraphs)
6 changes: 4 additions & 2 deletions ghidrecomp/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

THREAD_COUNT = multiprocessing.cpu_count()


def get_parser() -> argparse.ArgumentParser:

parser = argparse.ArgumentParser(description='ghidrecomp - A Command Line Ghidra Decompiler',
Expand All @@ -17,6 +18,8 @@ def get_parser() -> argparse.ArgumentParser:
parser.add_argument('--cppexport', action='store_true', help='Use Ghidras CppExporter to decompile to single file')
parser.add_argument('--filter', dest='filters', action='append', help='Regex match for function name')
parser.add_argument('--project-path', help='Path to base ghidra projects ', default='ghidra_projects')
parser.add_argument('--gzf', help='Export gzf of analyzed project', action='store_true')
parser.add_argument('--gzf-path', help='Path to store gzf of analyzed project', default='gzfs')
parser.add_argument('--gdt', help='Additional GDT to apply', nargs='?', action='append')
parser.add_argument('-o', '--output-path', help='Location for all decompilations', default='ghidrecomps')
parser.add_argument("-v", "--version", action="version", version=__version__)
Expand All @@ -40,6 +43,5 @@ def get_parser() -> argparse.ArgumentParser:
add_cg_args_to_parser(parser)

add_bsim_args_to_parser(parser)


return parser
return parser
14 changes: 13 additions & 1 deletion ghidrecomp/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ghidra_builtins import * # noqa: F403


def analyze_program(program, verbose: bool = False, force_analysis: bool = False, save: bool = False):
def analyze_program(program: "ghidra.program.model.listing.Program", verbose: bool = False, force_analysis: bool = False, save: bool = False, gzf_path: Path = None):
"""
Modified pyhidra.core._analyze_program
"""
Expand Down Expand Up @@ -47,6 +47,18 @@ def analyze_program(program, verbose: bool = False, force_analysis: bool = False
print(f'{program} already analyzed... skipping')


def save_program_as_gzf(program: "ghidra.program.model.listing.Program", gzf_path: Path, project):
    """
    Save a packed copy of a program as a .gzf file via its project.

    program: the program to export
    gzf_path: destination path WITHOUT the '.gzf' suffix; the suffix is appended here
    project: object exposing saveAsPackedFile(program, File, bool); the caller
        in decompile() passes a GhidraProject
    """
    # Kept function-local as in the original; presumably java.* is only
    # importable after the JVM has been started -- confirm before hoisting.
    from java.io import File

    # Build the target path once so the log line and the written file agree.
    packed_file_path = f'{gzf_path}.gzf'
    print(f'Saving gzf archive to {packed_file_path}')

    # NOTE(review): the final True is presumably the overwrite flag -- confirm
    # against the Ghidra GhidraProject API.
    project.saveAsPackedFile(program, File(packed_file_path), True)


def setup_symbol_server(symbols_path: Union[str, Path], level=1, server_urls=None) -> None:
"""
setup symbols to allow Ghidra to download as needed
Expand Down
55 changes: 55 additions & 0 deletions tests/test_gzf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest
from pathlib import Path

import pyhidra
from ghidrecomp import decompile, get_parser
from ghidrecomp.decompile import get_bin_output_path, gen_proj_bin_name_from_path
from ghidrecomp.bsim import has_bsim, add_bsim_args_to_parser, add_categories_to_prog
from pyhidra import HeadlessPyhidraLauncher
from pathlib import Path


def test_gzf_created(shared_datadir: Path):
    """Running with --gzf writes <output_path>/gzfs/<project name>.gzf."""

    target_bin = shared_datadir / 'ls_aarch64'
    cli_args = [str(target_bin.absolute()), "--filter", "ctype", "--skip-cache", "--gzf"]
    args = get_parser().parse_args(cli_args)

    proj_name = gen_proj_bin_name_from_path(target_bin)
    expected_output_path = get_bin_output_path(args.output_path, proj_name)

    # No --gzf-path given, so the archive goes to 'gzfs' under the output path.
    expected_gzf = Path(args.output_path) / 'gzfs' / f"{proj_name}.gzf"

    results = decompile(args)
    all_funcs, decompilations, output_path, _compiler, _lang_id, callgraphs = results

    assert len(all_funcs) == 8
    assert len(decompilations) == 8
    assert len(callgraphs) == 0
    assert expected_output_path == output_path
    assert expected_gzf.exists()


def test_gzf_created_with_path(shared_datadir: Path):
    """Running with --gzf and --gzf-path writes the archive into the custom directory."""

    parser = get_parser()

    bin_path = shared_datadir / 'ls_aarch64'
    gzf_custom_path = shared_datadir / "custom_gzf"

    args = parser.parse_args([f"{bin_path.absolute()}", "--filter", "ctype", "--skip-cache",
                              "--gzf", "--gzf-path", str(gzf_custom_path)])

    bin_proj_name = gen_proj_bin_name_from_path(bin_path)
    expected_output_path = get_bin_output_path(args.output_path, bin_proj_name)

    gzf_output_path = gzf_custom_path / f"{bin_proj_name}.gzf"

    all_funcs, decompilations, output_path, compiler, lang_id, callgraphs = decompile(args)

    assert len(all_funcs) == 8
    assert len(decompilations) == 8
    assert len(callgraphs) == 0
    assert expected_output_path == output_path
    # Check the archive file itself: the directory alone is created by
    # decompile() via mkdir and would exist even if no gzf had been written.
    assert gzf_output_path.exists()

0 comments on commit 16fa76a

Please sign in to comment.