-
Notifications
You must be signed in to change notification settings - Fork 0
/
graphity.py
900 lines (674 loc) · 28.4 KB
/
graphity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
#!/usr/bin/env python
import r2pipe
import sys
import os
import json
import re
import networkx as nx
from time import time
from datetime import datetime
from argparse import ArgumentParser
from base64 import b64decode
from collections import Counter
from graphityOut import toNeo, fromNeo, printGraph, printGraphInfo, dumpGraphInfoCsv, toPickle, fromPickle
from graphityViz import graphvizPlot, dumpJsonForJit, dumpGml, dumpGmlSubgraph, dumpJsonForD3
from graphityUtils import gimmeDatApiName, sha1hash, getAllAttributes, is_ascii, Hvalue, check_pe_header
from graphityOps import patternScan
import graphityFunc
# Works, takes its time, sometimes assigns wrong names to functions
# DEPRECATED
def loadFlirts():
    """Pass every regular file of the local ``flirt`` directory to r2's ``zfs``.

    The directory is looked up next to this script. The whole routine is
    best effort: any exception is printed and swallowed.
    """
    try:
        # FLIRT signatures live in ./flirt next to this file
        signatureRoot = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'flirt')
        for entry in os.listdir(signatureRoot):
            candidate = os.path.join(signatureRoot, entry)
            if os.path.isfile(candidate):
                R2PY.cmd("zfs %s" % candidate)
    except Exception as e:
        print(str(e) + " FAIL loading FLIRT sig file")
# Too slow for now, waiting for fix
def loadZigs():
    """Open ``signatures/msvcrt.sdb`` in r2 and run a signature scan (``z/``).

    Progress is reported on stdout with timestamps. Any exception is printed
    and swallowed.
    """
    try:
        print('Loading msvcrt.sdb {:%Y-%m-%d %H:%M:%S}'.format(datetime.now()))
        scriptDir = os.path.abspath(os.path.dirname(__file__))
        sdbFile = os.path.join(scriptDir, 'signatures', 'msvcrt.sdb')
        # TODO load all signatures
        R2PY.cmd("zo %s" % sdbFile)
        print('msvcrt.sdb loaded {:%Y-%m-%d %H:%M:%S}'.format(datetime.now()))

        # restrict the search to executable sections before scanning
        # e search.in = raw --- ?
        R2PY.cmd("e search.in = io.sections.exec")
        R2PY.cmd("z/")
        print('msvcrt.zig scan on code section(s) finished {:%Y-%m-%d %H:%M:%S}'.format(datetime.now()))
    except Exception as e:
        print(str(e))
def flagLibraryCode(graphity):
    """Print the attributes of every graph node whose address carries a ``sign`` flag.

    graphity: the function graph; nodes are keyed by hex address strings.
    """
    flagsRaw = R2PY.cmd("fs sign; fj")
    if not flagsRaw:
        return
    for flag in json.loads(flagsRaw):
        address = hex(flag['offset'])
        if address in graphity:
            print (graphity.node[address])
# Checks whether an address is located in an executable section
def isValidCode(callAddress, sectionsList):
    """Return True if ``callAddress`` lies inside one of the given sections.

    callAddress:  address as a hexadecimal string (e.g. ``'0x401000'``).
    sectionsList: iterable of sections; element 0 is the start address
                  (inclusive) and element 1 the end address (exclusive).
                  Further elements are ignored.
    """
    # parse the address once instead of twice per section
    address = int(callAddress, 16)
    return any(section[0] <= address < section[1] for section in sectionsList)
# Returns a list of executable sections
def getCodeSections():
    """Parse r2's ``iS`` output into a list of executable sections.

    Only lines matching ``perm=....x`` are considered. Each such line is
    read as whitespace-separated ``key=value`` tokens.

    Returns a list of ``[start, end, psize]`` lists, where ``start`` is
    ``vaddr`` (parsed as hex), ``end`` is ``start + vsz`` and ``psize`` is
    ``sz`` (both parsed as decimal).
    """
    # regular expression to pick out the executable section(s)
    execSection = re.compile("perm=....x")
    requiredKeys = ('vaddr', 'vsz', 'sz')
    returnSections = []

    # will return the section table from radare2
    for line in R2PY.cmd("iS").splitlines():
        if not execSection.search(line):
            continue

        # fresh dict per line so a value can never leak over from a previous section
        sectionData = {}
        for element in line.split():
            key, sep, value = element.partition('=')
            # tokens without '=' are not key/value pairs; skip them instead of raising IndexError
            if sep:
                sectionData[key] = value

        if not all(key in sectionData for key in requiredKeys):
            continue

        start = int(sectionData['vaddr'], 16)
        end = start + int(sectionData['vsz'])
        psize = int(sectionData['sz'])
        returnSections.append([start, end, psize])

    return returnSections
# Returns an executable's imports as a list
def getIat():
    """Return the ``plt`` address of every import reported by r2's ``iij``.

    Addresses are returned as hex strings. An empty list is returned when
    r2 produces no output, instead of failing inside ``json.loads``.
    """
    rawImports = R2PY.cmd("iij")
    if not rawImports:
        return []
    return [hex(item['plt']) for item in json.loads(rawImports)]
# Returns a dictionary of xrefs to symbols
def crossRefScan():
    """Collect references to symbols (``axtj @@ sym.*``).

    Returns a dict mapping the hex address of a referencing instruction to
    its opcode text.

    * Non-data xrefs are recorded directly (address of the xref -> opcode).
    * Data xrefs whose opcode starts with ``mov``/``lea`` are followed: the
      first operand is taken as the register, up to 300 instructions after
      the xref are disassembled, and every ``call <reg>`` / ``jmp <reg>``
      found before the register is mentioned in any other line is recorded,
      with the original source operand appended after a ``;``.
    """
    finalCalls = {}

    # r2 prints one JSON array per matched symbol, one per line. Parse the lines
    # individually rather than splicing them into one document with string
    # replaces, which broke on leading, trailing or repeated blank lines.
    xrefj = []
    for rawLine in R2PY.cmd("axtj @@ sym.*").splitlines():
        rawLine = rawLine.strip()
        if rawLine:
            xrefj.append(json.loads(rawLine))

    # TODO check!!
    for xrefitem in xrefj:
        for xref in xrefitem:
            opcode = xref['opcode']

            # not data xref means its code or call
            if xref['type'] != 'd':
                finalCalls[hex(xref['from'])] = opcode
                continue

            # data potentially means API referenced by register; please note these are rather
            # uncommon in the long list of symbol refs, thus the bottleneck in parsing speed
            # lies in the number of refs
            if not opcode.startswith(('mov', 'lea')):
                continue

            # 'grepping' out the register from mov/lea operation
            register = opcode.split()[1].replace(',', '')

            # disassemble downwards; mmmaybe smarter to disassemble until end of function,
            # but possible that there is no function at all
            # TODO find end of function, just in case
            moreDisasm = R2PY.cmd("pd 300 @ " + hex(xref['from']))

            # possible branches towards target
            realCall = "call %s" % register
            aJmp = "jmp %s" % register

            # first line is the mov/lea itself, hence skipped
            for disasmLine in moreDisasm.splitlines()[1:]:
                if realCall in disasmLine or aJmp in disasmLine:
                    # found a call!! first token is its address, the rest its text
                    annotated = disasmLine + ";" + opcode.split(',')[1].rstrip()
                    tokens = annotated.split()
                    finalCalls[hex(int(tokens[0], 16))] = ' '.join(tokens[1:])
                elif register in disasmLine:
                    # TODO if mov dword abc, reg is found -> follow abc?
                    # TODO could be parsed in more detail, e.g. mov dword, reg won't change the reg
                    break

    return finalCalls
# Parses the binary for strings and their references to nodes
def stringScan(debugDict):
    """Return ``[xrefAddress, functionAddress, string]`` entries for referenced strings.

    NOTE(review): as written, the function returns an empty list right after
    initialising it, so none of the scanning code below is ever executed and
    ``debugDict`` is left without any of the ``strings*`` keys. Whether this
    is a deliberate switch-off or a leftover could not be determined from
    this file - confirm before removing either the early return or the dead
    code.
    """
    # Workflow is: get string, get xrefs to string if any, get functions of xrefs if any; fit node in graph with the string
    allMyStrings = []
    # NOTE(review): unconditional early return - everything below is unreachable
    return allMyStrings
    # izzj parses entire binary
    stringCmd = "izzj"
    strings = R2PY.cmd(stringCmd)
    parsedStrings = json.loads(strings)
    debugDict['stringsDangling'] = []
    debugDict['stringsNoRef'] = []
    i = 0
    j = 1
    while i < len(parsedStrings["strings"]):
        stringItem = parsedStrings["strings"][i]
        # Strings when retrieved through izzj command are BASE64 encoded
        thatOneString = b64decode(stringItem['string']).replace(b'\\', b' \\\\ ')
        # NOTE(review): the result of this replace() is discarded, so single quotes are NOT removed
        thatOneString.replace(b'\'', b'')
        try:
            thatOneString = thatOneString.decode()
            xrefCmd = "axtj @ " + hex(stringItem['vaddr'])
            stringXrefsJ = R2PY.cmd(xrefCmd)
            # RN
            stringXrefsJ = stringXrefsJ.replace("\"\"", "\"")
            # print(stringXrefsJ)
            # TODO this should be a list, but is returned as a string now?
            #if stringXrefsJ != []:
            # anything longer than two characters is treated as a non-empty xref list
            if len(stringXrefsJ) > 2:
                stringXrefs = json.loads(stringXrefsJ)
                # check whether string item is root of list of strings
                j = 1
                lastItem = stringItem
                while (i + j) < len(parsedStrings["strings"]):
                    nextStringItem = parsedStrings["strings"][i + j]
                    lastAddr = lastItem['vaddr']
                    lastSize = lastItem['size']
                    # string offsets are 4 byte aligned, TODO check whether this is always the case
                    padding = 4 - (lastSize % 4)
                    if padding == 4:
                        padding = 0
                    nextAddr = lastAddr + lastSize + padding
                    if nextAddr != nextStringItem['vaddr'] or hasXref(hex(nextStringItem['vaddr'])):
                        # end.. exit here
                        break
                    else:
                        # directly following, unreferenced string: merge it into the current one
                        thatOneString = thatOneString + "|" + b64decode(nextStringItem['string']).decode()
                        j = j + 1
                        lastItem = nextStringItem
                # iterate refs on string, if any
                for ref in stringXrefs:
                    # sort out strings with code ref, i.e. non-strings
                    if ref['type'] != 'c' and ref['type'] != 'C':
                        stringAddr = hex(ref['from'])
                        stringFuncRef = gimmeRespectiveFunction(stringAddr)
                        if stringFuncRef != '0x0':
                            allMyStrings.append([stringAddr, stringFuncRef, thatOneString])
                        else:
                            # TODO this is merely still useful strings, see how to fit them in the graphs and db
                            # RN print("DANGLING STRING NO FUNCREF %s %s" % (stringAddr, thatOneString))
                            debugDict['stringsDangling'].append(thatOneString)
            else:
                debugDict['stringsNoRef'].append(thatOneString)
        except UnicodeDecodeError:
            pass
        # skip over the strings that were merged into the current one
        if j > 1:
            i = i + j
        else:
            i = i + 1
    debugDict['stringsDanglingTotal'] = len(debugDict['stringsDangling'])
    debugDict['stringsNoRefTotal'] = len(debugDict['stringsNoRef'])
    return allMyStrings
# Test whether xrefs exist for given address
def hasXref(vaddr):
    """Return True if r2 reports at least one xref to ``vaddr`` (hex string).

    Output of two characters or fewer (for instance an empty JSON list
    ``[]``) counts as "no xrefs". stringScan applies the same length check
    to the output of the same ``axtj`` command. Previously any non-empty
    output, including ``[]``, was reported as a hit.
    """
    refs = R2PY.cmd("axtj @ " + vaddr)
    return bool(refs) and len(refs.strip()) > 2
# Creating the NetworkX graph, nodes are functions, edges are calls or callbacks
def createRawGraph():
    """Build the function call graph from the current r2 analysis.

    Steps:
      1. one node per function from ``aflj``,
      2. one edge per call xref (type ``C``) between two known functions,
      3. API references from crossRefScan() attached to the ``calls`` list
         of the containing function,
      4. string references from stringScan() attached to the ``strings`` list.

    Returns ``(graphity, debugDict)`` where ``graphity`` is an ``nx.DiGraph``
    keyed by hex address strings and ``debugDict`` holds counters gathered
    on the way.
    """
    graphity = nx.DiGraph()
    debugDict = {}

    functions = R2PY.cmd("aflj")
    functionList = json.loads(functions) if functions else []

    # figuring out code section size total
    sectionsList = getCodeSections()
    debugDict['xsectionsize'] = sum(execSec[2] for execSec in sectionsList)

    # CREATING THE GRAPH
    refsGlobalVar = 0
    refsUnrecognized = 0
    refsFunc = 0
    debugDict['functions'] = len(functionList)

    ### NetworkX Graph Structure ###
    # FUNCTION as node, attributes: function address, size, calltype, list of calls, list of strings, count of calls; functiontype[Callback, Export], alias (e.g. export name), mnemonic distribution
    # FUNCTION REFERENCE as edge (function address -> target address), attributes: ref offset (at)
    # INDIRECT REFERENCE as edge (currently for threads and Windows hooks, also indirect code and indirect data references)
    # API CALLS (list attribute of function node): address, API name
    # STRINGS (list attribute of function node): address, string, evaluation
    ####
    # TODO add count of refs from A to B as weights to edges
    # TODO count calls to global vars, to indirect targets

    for item in functionList:
        graphity.add_node(hex(item['offset']), size=item['realsz'], calltype=item['calltype'], calls=[], apicallcount=0, strings=[], stringcount=0, functiontype='')

    # Import addresses are fetched from r2 at most once, and only if an unresolved
    # call target is actually met. Previously getIat() was re-run for every such xref.
    iatAddresses = None

    # If an edge is added, that includes a non-existent node, the node will be added, but w/o
    # the necessary attributes. Thasss why we iterate twice.
    for item in functionList:
        # TODO look into new values provided by aflj
        if 'callrefs' not in item:
            continue

        for xref in item['callrefs']:
            if xref['type'] != 'C':
                continue

            target = hex(xref['addr'])

            if target in graphity:
                # recursive calls do not become edges
                # NOTE(review): counter placement (inside this check) was reconstructed
                # from a source that had lost its indentation
                if item['offset'] != xref['addr']:
                    graphity.add_edge(hex(item['offset']), target, pos=hex(xref['at']))
                    refsFunc = refsFunc + 1
                continue

            if iatAddresses is None:
                iatAddresses = set(getIat())

            if target in iatAddresses:
                # call to an import, handled later via crossRefScan()
                pass
            elif not isValidCode(target, sectionsList):
                # TODO do something
                print("DANGLING call to address outside code section, glob var, dynamic API loading %s -> %s" % (hex(item['offset']), target))
                refsGlobalVar = refsGlobalVar + 1
            else:
                print("FAIL: Call to code thats not a function, an import/symbol or otherwise recognized. Missed function perhaps. %s -> %s" % (hex(item['offset']), target))
                refsUnrecognized = refsUnrecognized + 1

    print('* %s Graph created with NetworkX ' % str(datetime.now()))
    debugDict['refsFunctions'] = refsFunc
    debugDict['refsGlobalVar'] = refsGlobalVar
    debugDict['refsUnrecognized'] = refsUnrecognized

    apiRefs = crossRefScan()
    callNum = len(apiRefs)
    missesNum = 0

    # FITTING GRAPH WITH API REFS
    for call in apiRefs:
        # get the address of the function, that contains the call to a given symbol
        funcAddress = gimmeRespectiveFunction(call)

        # TODO check if funcAddress is the real function address
        if funcAddress in graphity:
            # node(funcAddress) has attribute calls, which contains a list of API calls
            api = gimmeDatApiName(apiRefs[call])
            graphity.node[funcAddress]['calls'].append([call, api])
        else:
            # detected API call reference does not resolve to a function offset, insert handling for this here
            print("DANGLING API CALL %s %s" % (call, apiRefs[call]))
            missesNum = missesNum + 1

    # debug: print total API refs and functionless API refs, maybe indicator for obfuscated code
    print('* %s Graph extended with API calls, %d calls in total, %d dangling w/o function reference ' % (str(datetime.now()), callNum, missesNum))
    debugDict['apiTotal'] = callNum
    debugDict['apiMisses'] = missesNum

    # FITTING GRAPH WITH STRING REFS
    allTheStrings = stringScan(debugDict)
    stringrefs = 0

    for stringAddr, stringFunc, stringData in allTheStrings:
        # add string to respective function node in graph
        if stringFunc in graphity:
            graphity.node[stringFunc]['strings'].append([stringAddr, stringData])
            stringrefs = stringrefs + 1
        else:
            print("\n*** BIG FAIL *** String's function not in graph %s %s" % (stringFunc, stringData))

    print('* %s Graph extended with string references ' % (str(datetime.now())))
    debugDict['stringsReferencedTotal'] = stringrefs

    return graphity, debugDict
# Tag exports of DLLs
# TODO : check whether exports are coming back after bugfix (?)
def analyzeExports(graphity):
    """Mark graph nodes that contain an exported address.

    For every export reported by r2 (``iEj``) the containing function is
    looked up; if it is a node of ``graphity`` its ``functiontype`` is set
    to ``'Export'`` and ``alias`` to the export name. Nothing happens when
    r2 returns no output.
    """
    rawExports = R2PY.cmd("iEj")
    if not rawExports:
        # no output from r2: nothing to tag, and json.loads('') would raise
        return

    for item in json.loads(rawExports):
        exportAddress = hex(item['vaddr'])
        exportName = item['name']
        exportFunction = gimmeRespectiveFunction(exportAddress)
        if exportFunction in graphity:
            graphity.node[exportFunction]['functiontype'] = 'Export'
            graphity.node[exportFunction]['alias'] = exportName
# Removing thunks as they make my graphs fat, replace by API calls
def thunkPruning(graphity):
    """Replace thunk nodes by API-call entries on the functions that reference them.

    A node counts as a thunk when it has exactly one API call, a size of 6
    and no outgoing edges. For each thunk, every xref to it is resolved to
    its containing function and the thunk's API name is appended to that
    function's ``calls`` list; the thunk node is then removed.
    """
    # Iterate over a snapshot: nodes are removed inside the loop, which is not allowed
    # while iterating a live node view.
    for thunk, attrs in list(graphity.nodes(data=True)):
        # most obvious thunks, other thunks exist too, len seen was 11, 13
        # TODO !!!!!!!! check for 64bit
        # TODO check with radare for thunk detection?
        # funclets that contain nothing but a jump to an import, and do not call other functions.
        # out_degree() is used because `not graphity.successors(n)` is always False when
        # successors() returns an iterator.
        if len(attrs['calls']) != 1 or attrs['size'] != 6 or graphity.out_degree(thunk) != 0:
            continue

        thunkApi = attrs['calls'][0]

        # need to go on with radare from here, cause graphity doesn't know all the addresses
        # of the xrefs to thunks from within a function
        rawRefs = R2PY.cmd("axtj " + thunk)
        thunkRefs = json.loads(rawRefs) if rawRefs else []

        for aRef in thunkRefs:
            thunkCallAddr = hex(aRef['from'])
            thunkFuncRef = gimmeRespectiveFunction(thunkCallAddr)

            # if thunk's xrefs include a detected function then add thunk as a regular API call
            # to calls list of respective node; callers that are not graph nodes are skipped
            if thunkFuncRef != '0x0' and thunkFuncRef in graphity:
                graphity.node[thunkFuncRef]['calls'].append([thunkCallAddr, thunkApi[1]])

        # after xref to thunk has been added to all calling functions, remove thunk node from graph
        graphity.remove_node(thunk)
# Adding edges to indirectly referenced functions, thread handlers and hook functions for now only
def tagCallbacks(graphity):
    """Add indirect edges for callbacks and for otherwise unreferenced functions.

    Pass 1: for API calls whose name contains ``CreateThread`` or
    ``SetWindowsHookEx`` the handler address is recovered with getCallback()
    and linked as an ``apicallback`` edge.

    Pass 2: for every node with in-degree zero, r2 xrefs to the node are
    turned into ``coderef`` edges (code xrefs whose opcode does not contain
    ``dword [``) or ``dataref`` edges (data xrefs whose opcode starts with
    push/lea/mov).

    NOTE(review): the nesting of the final ``else`` was reconstructed from a
    source that had lost its indentation.
    """
    for nodeAddr, nodeAttrs in graphity.nodes(data=True):
        for apiCall in nodeAttrs['calls']:
            handler = ''
            # TODO consider this bad practise, do something smarter, not sure yet what, consider _beginthread API etc. etc.
            # also, maybe this is fixed in radare later, so consider this code redundant by then
            if 'CreateThread' in apiCall[1]:
                handler = getCallback(apiCall[0], 3)
            if 'SetWindowsHookEx' in apiCall[1]:
                handler = getCallback(apiCall[0], 2)
            if handler:
                print (handler, nodeAddr)
                addIndirectEdge(graphity, nodeAddr, handler, "apicallback", "Callback")

    # implicitly filters out callbacks fixed already - gets all nodes with zero in-degree
    # TODO see if feasible for all functions, even with such already having in edges
    dataRefPrefixes = ('push', 'lea', 'mov')
    for nodeAddr, _ in graphity.nodes(data=True):
        if graphity.in_degree(nodeAddr) != 0:
            continue

        rawXrefs = R2PY.cmd("axtj @ " + nodeAddr)
        if not rawXrefs:
            continue

        for xref in json.loads(rawXrefs):
            refKind = xref['type']

            if refKind == 'c':
                # if xref is code its almost certainly an edge to add
                # TODO circle back on jumptable-as-a-function bug from r2
                # really ugly workaround, really really ugly..
                if 'dword [' not in xref['opcode']:
                    addIndirectEdge(graphity, hex(xref['from']), nodeAddr, "coderef", "IndirectCode")

            elif refKind == 'd':
                # if xref is data
                opcd = xref['opcode']
                # TODO run more tests on this list not sure these are all possible cases
                # TODO make datarefs optional!
                if opcd.startswith(dataRefPrefixes):
                    print (hex(xref['from']), opcd)
                    addIndirectEdge(graphity, hex(xref['from']), nodeAddr, "dataref", "IndirectData")
                else:
                    # TODO look into add reg, ThreadRoutine -> as xref
                    print ("up for discussion: " + hex(xref['from']), xref['type'], xref['opcode'])
def addIndirectEdge(graphity, fromAddr, toAddr, calltype, functiontype):
    """Link the functions containing ``fromAddr`` and ``toAddr`` with an indirect edge.

    Both addresses are first resolved to their containing function. If both
    functions are graph nodes, the target's ``functiontype`` attribute is
    set and an edge carrying ``calltype`` is added; otherwise a diagnostic
    is printed and the graph is left unchanged.
    """
    source = gimmeRespectiveFunction(fromAddr)
    target = gimmeRespectiveFunction(toAddr)

    if source not in graphity or target not in graphity:
        print ("Something went wrong with indirect edge ", fromAddr, toAddr, calltype)
        return

    graphity.node[target]['functiontype'] = functiontype
    graphity.add_edge(source, target, calltype=calltype)
    print ("added callback edge", source, target, calltype, "\n")
# Parsing the handler offset out of the function arguments
def getCallback(call, argcount):
    """Return the address pushed as the ``argcount``-th argument before ``call``.

    call:     hex address string of the API call instruction.
    argcount: which ``push`` to pick, counting upwards from the call.

    The 30 instructions before ``call`` are disassembled and walked
    bottom-up. The operand of the selected push is returned as a hex string
    if it looks like ``fcn.<hex>`` or contains ``0x<hex>``. In every other
    case (not enough pushes, no operand, operand not parseable as hex) an
    empty string is returned.
    """
    # simplistic: walk up the code until xref to code is found, works as long as API only
    # receives one code ref, works well with Windows APIs
    upwards = R2PY.cmd("pd -30 @" + call)

    for otherLine in reversed(upwards.splitlines()):
        if 'push' not in otherLine:
            continue

        argcount = argcount - 1
        if argcount:
            continue

        # TODO better done with a regex, bug prone
        operands = otherLine.split("push", 1)[1].split()
        if not operands:
            return ''
        address = operands[0]

        try:
            if 'fcn.' in address:
                return hex(int(address.split('.')[1], 16))
            if '0x' in address:
                return hex(int(address.split('0x')[1], 16))
        except ValueError:
            # operand carries extra characters (e.g. a closing bracket): no usable address
            pass
        return ''

    # fewer pushes than requested; '' keeps the return type consistent for callers
    return ''
# WORKAROUND until function detection - bug? feature? in radare is fixed and export vaddr equal actual offsets again
def gimmeRespectiveFunction(address):
    """Return r2's answer to ``?v $FB @ <address>``.

    A falsy ``address`` yields an empty string without querying r2. Callers
    in this file compare the result against ``'0x0'`` to detect "no
    containing function".
    """
    if not address:
        return ''
    return R2PY.cmd("?v $FB @ " + address)
def mnemonicism(offset):
    """Return a score for how many shift/multiply/rotate/load/store ops a function has.

    offset: hex address string of the function.

    The function is dumped with ``pdfj``. Ops whose ``type`` is one of
    shl, shr, mul, div, rol, ror, sar, load or store are counted, and the
    result is ``count * 10 / size`` with ``size`` taken from the dump.
    Returns ``0.0`` when r2 yields no dump or reports a size of zero, which
    previously raised ZeroDivisionError.
    """
    weightedTypes = ('shl', 'shr', 'mul', 'div', 'rol', 'ror', 'sar', 'load', 'store')

    funcdump = R2PY.cmd("pdfj @ " + offset)
    if not funcdump:
        return 0.0

    dumpj = json.loads(funcdump)
    fsize = dumpj["size"]
    if not fsize:
        return 0.0

    # not every op carries a "type" field
    mnemdict = Counter(item["type"] for item in dumpj["ops"] if "type" in item)
    weight = sum(mnemdict[mnem] for mnem in weightedTypes)

    return (weight * 10) / fsize
# TODO count how many above certain threshold, see how close they are together in the graph?
# super graph creation function, radare-analyses the sample, puts together all of the graph and debug info
def graphMagix(filepath, allAtts, deactivatecache):
    """Return ``(graphity, debug)`` for a sample, from cache or from a fresh r2 analysis.

    filepath:        path of the binary to analyse.
    allAtts:         attribute dict of the sample; ``sha1`` and ``filetype`` are read here.
    deactivatecache: when False, ``cache/<sha1>.txt`` / ``.dbg`` are used if
                     both exist and are written after a fresh analysis.

    A fresh analysis opens the module-global ``R2PY`` session and records
    timings in the module-global ``BENCH`` dict (which the caller must have
    created). A cache hit touches neither.
    """
    global R2PY

    cacheStem = "cache/" + allAtts['sha1']
    cacheUsable = (os.path.isfile(cacheStem + ".txt") and os.path.isfile(cacheStem + ".dbg") and deactivatecache == False)

    if cacheUsable:
        print('* %s Loading graph from cache under ./cache/[sha1].txt or .dbg' % str(datetime.now()))
        graphity, debug = fromPickle(allAtts['sha1'])
        return graphity, debug

    print('* %s R2 started analysis ' % str(datetime.now()))
    BENCH['r2_start'] = time()
    print("filepath:" + filepath)

    R2PY = r2pipe.open(filepath)

    # r2 configuration applied before the analysis, in this order
    r2Settings = (
        "e asm.lines = false",
        "e asm.fcnlines = false",
        "e anal.autoname= false",
        "e anal.jmptbl = true",
        "e anal.hasnext = true",
        "e anal.bb.maxsize = 1M",
    )
    for setting in r2Settings:
        R2PY.cmd(setting)

    # signature loading (loadZigs / loadFlirts) is currently switched off
    R2PY.cmd("aaa")

    BENCH['r2_end'] = time()
    print('* %s R2 finished analysis' % str(datetime.now()))

    # GRAPH CREATION
    graphity, debug = createRawGraph()

    # TODO testing lib code detected (flagLibraryCode)

    # DLL PROCESSING
    if 'DLL' in allAtts['filetype']:
        analyzeExports(graphity)

    # Thunk pruning, thunks are unnecessary information in the graph
    thunkPruning(graphity)

    # handler tagging
    tagCallbacks(graphity)

    # update api and string count attributes
    for _, nodeAttrs in graphity.nodes(data=True):
        nodeAttrs['apicallcount'] = len(nodeAttrs['calls'])
        nodeAttrs['stringcount'] = len(nodeAttrs['strings'])

    # calc mnemonic dist
    for nodeAddr in graphity.nodes():
        graphity.node[nodeAddr]['mnemonicism'] = mnemonicism(nodeAddr)

    BENCH['graph_end'] = time()

    # graph and debug info caching to save parsing time, potentially
    if (deactivatecache == False):
        toPickle(graphity, debug, allAtts['sha1'])

    return graphity, debug
'''
#global R2PY
for entry in behaviours:
info = behaviours[entry]
for api_info in info:
for api in api_info:
print(api, api_info[api])
addr = "s." + str(api_info[api])
R2PY.cmd(addr)
print(R2PY.cmd("pdf"))
'''
def get_behaviors(filepath, dst_file):
    """Scan a PE file for the behaviour patterns of ``graphityFunc.funcDict``.

    filepath: sample to analyse; anything failing ``check_pe_header`` yields
              an empty result.
    dst_file: path of the JSON report. ``.behav.json`` is stripped from it
              to form the base name of the pseudo-code dumps.

    For every pattern hit whose ``patterns`` dict contains no ``False``
    value, the hit is collected under the pattern name. For each distinct
    value in the collected hits (used as an r2 seek target) the output of
    r2's ``pdc`` is written to ``<base>.<pattern>_<value>.c``.

    Returns the dict that is also written to ``dst_file``:
    ``{"Suspicious Behaviors": {...}}``, or ``{}`` when nothing was found.

    The r2 session opened for the analysis is closed before returning, so
    repeated calls no longer leave one radare2 process behind each.
    """
    global BENCH
    BENCH = {}
    behaviours = {}
    ret_info = {}
    r2Opened = False

    try:
        if check_pe_header(filepath):
            print('* %s Parsing %s ' % (str(datetime.now()), filepath))
            allAtts = getAllAttributes(filepath)
            # the cache is always bypassed here, so graphMagix opens a live R2PY session
            graphity, debug = graphMagix(filepath, allAtts, True)  # args.deactivatecache)
            r2Opened = True

            # BEHAVIOR
            print('* %s Scanning for API patterns ' % str(datetime.now()))
            BENCH['behavior_start'] = time()
            allThePatterns = graphityFunc.funcDict
            for patty in allThePatterns:
                findings = patternScan(graphity, allThePatterns[patty])
                for hit in findings:
                    if False not in hit['patterns'].values():
                        print("For %s found %s" % (patty, str(hit['patterns'])))
                        behaviours.setdefault(patty, []).append(hit['patterns'])
            BENCH['behavior_end'] = time()

        if behaviours:
            # seek target (as string) -> first behaviour it was seen in
            function_list = {}
            for behav in behaviours:
                for entry in behaviours[behav]:
                    for name in entry:
                        target = str(entry[name])
                        if target not in function_list:
                            function_list[target] = behav
                            # NOTE(review): position of this debug print was reconstructed
                            # from a source that had lost its indentation
                            print(entry)

            base_file = dst_file.replace(".behav.json", "")
            for funct in function_list:
                R2PY.cmd("s." + funct)
                pseudo_code = R2PY.cmd("pdc")
                code_file = base_file + "." + function_list[funct] + "_" + funct + ".c"
                with open(code_file, "w") as out:
                    # drop blank lines and trailing whitespace
                    for line in pseudo_code.split("\n"):
                        line = line.rstrip()
                        if line:
                            out.write(line + "\n")

            ret_info["Suspicious Behaviors"] = behaviours
    finally:
        if r2Opened:
            R2PY.quit()

    with open(dst_file, "w") as out:
        out.write(json.dumps(ret_info, sort_keys=True, indent=4))

    return ret_info
if __name__ == '__main__':
    # Command line entry point: analyses a single PE file or, for csvdump/neodump,
    # every PE file below a directory.

    # Module-level timing store, also written to by graphMagix().
    BENCH = {}

    parser = ArgumentParser()
    parser.add_argument("input", help="Tool requires an input file or directory; directory, i.e. batch processing, only possible and feasible for csvdump option")
    parser.add_argument("-d", "--deactivatecache", action="store_true", help="Deactivate caching of graphs, for debugging of graph generation")

    # Text output options
    parser.add_argument("-p", "--printing", action="store_true", help="Print the graph as text, as in, nodes with respective content")
    parser.add_argument("-i", "--info", action="store_true", help="Print info and stats of the graph")
    parser.add_argument("-b", "--behavior", action="store_true", help="Scan for behaviors listed in graphityFunc.py")

    # Visualization & viz data options
    parser.add_argument("-l", "--plotting", action="store_true", help="Plotting the graph via pyplot")
    parser.add_argument("-g", "--gml", action="store_true", help="Spit out GML data for Gephi and what not")
    parser.add_argument("-s", "--gmlsub", help="Define an offset in the form e.g. 0x401000 to dump the subgraph starting there")
    parser.add_argument("-j", "--jit", action="store_true", help="Spits out JSON data, ready to be visualized within JS InfoVis as force directed graph")

    # Batch processing options
    parser.add_argument("-n", "--neodump", action="store_true", help="Dump graph to Neo4j (configured to flush previous data from Neo, might wanna change that) - BATCH PROCESSING ONLY")
    parser.add_argument("-c", "--csvdump", help="Dump info data to a given csv file, appends a line per sample, for testing now also dumps strings per binary in dedicated csv file - BATCH PROCESSING ONLY")

    args = parser.parse_args()

    # TODO check the path pythonically
    # Batch processing options: csvdump, neodump, TBC
    if args.input and os.path.isdir(args.input):
        for (dirpath, dirnames, filenames) in os.walk(args.input):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)

                if check_pe_header(filepath):
                    print('* %s Parsing %s ' % (str(datetime.now()), filename))
                    allAtts = getAllAttributes(filepath)
                    graphity, debug = graphMagix(filepath, allAtts, args.deactivatecache)

                    if args.csvdump:
                        # CSVDUMP
                        dumpGraphInfoCsv(graphity, debug, allAtts, args.csvdump)
                        print('* %s Dumping graph info to indicated csv file ' % str(datetime.now()))

                    if args.neodump:
                        # TO NEO STUFF
                        toNeo(graphity, allAtts)
                        print('* %s Dumped to Neo4J ' % str(datetime.now()))

    elif args.input and check_pe_header(args.input):
        # ATTRIBUTES: md5, sha1, filename, filetype, ssdeep, filesize, imphash, compilationts, addressep, sectionep,
        # sectioncount, sectioninfo, tlssections, originalfilename
        allAtts = getAllAttributes(args.input)
        graphity, debug = graphMagix(args.input, allAtts, args.deactivatecache)

        # TODO decide what to do with dangling strings/APIs (string filtering with frequency analysis?)

        if args.printing:
            # PRINT GRAPH TO CMDLINE
            print("* %s Printing the graph - nodes and node attributes" % str(datetime.now()))
            BENCH['printing_start'] = time()
            printGraph(graphity)
            BENCH['printing_end'] = time()

        if args.info:
            # PRINT GRAPH INFO
            BENCH['info_start'] = time()
            printGraphInfo(graphity, debug)
            BENCH['info_end'] = time()
            # TODO look into certificate info: iC

        if args.plotting:
            # GRAPH PLOTTING STUFF
            print('* %s Plotting routine starting ' % str(datetime.now()))
            BENCH['plotting_start'] = time()
            graphvizPlot(graphity, allAtts)
            BENCH['plotting_end'] = time()
            print('* %s Plotting routine finished ' % str(datetime.now()))

        if args.neodump:
            # TO NEO STUFF
            BENCH['neo_start'] = time()
            toNeo(graphity, allAtts)
            BENCH['neo_end'] = time()
            print('* %s Dumped to Neo4J ' % str(datetime.now()))

        if args.behavior:
            # BEHAVIOR
            # TODO enable switching of behavior dictionaries
            print('* %s Scanning for API patterns ' % str(datetime.now()))
            BENCH['behavior_start'] = time()
            allThePatterns = graphityFunc.funcDict
            behaviours = {}
            for patty in allThePatterns:
                findings = patternScan(graphity, allThePatterns[patty])
                for hit in findings:
                    if False not in hit['patterns'].values():
                        print("For %s found %s" % (patty, str(hit['patterns'])))
                        behaviours.setdefault(patty, []).append(hit['patterns'])

            # `save_behaviors` is not defined or imported anywhere in this file, so the
            # report is written here, in the same layout get_behaviors() produces.
            out_file = args.input + ".behavior.json"
            behaviorReport = {"Suspicious Behaviors": behaviours} if behaviours else {}
            with open(out_file, "w") as out:
                out.write(json.dumps(behaviorReport, sort_keys=True, indent=4))
            BENCH['behavior_end'] = time()

        if args.gml:
            # GML and stuff
            BENCH['gml_start'] = time()
            dumpGml(graphity, allAtts)
            BENCH['gml_end'] = time()

        if args.gmlsub:
            # TODO add bench
            # was the undefined bare name `gmlsub`
            dumpGmlSubgraph(graphity, args.gmlsub)

        if args.jit:
            #dumpJsonForJit(graphity, indent=2)
            BENCH['d3_start'] = time()
            dumpJsonForD3(graphity)
            BENCH['d3_end'] = time()

        # TODO calculate dispersion for 2-n anchor addresses
        # TODO handling of LoadLib/GetPAddr. for "hiding something" question, follow GetProc return value

        print('* %s Stuffs all finished ' % str(datetime.now()))

        # TIME
        print("\n__..--*** I WANNA BE A BENCHMARK WHEN I GROW UP ***--..__")

        if 'r2_start' in BENCH:
            print("__ %5f R2 Analysis" % (BENCH['r2_end'] - BENCH['r2_start']))
        if 'graph_end' in BENCH:
            print("__ %5f Graph construction" % (BENCH['graph_end'] - BENCH['r2_end']))
        if 'printing_start' in BENCH:
            print("__ %5f Printing" % (BENCH['printing_end'] - BENCH['printing_start']))
        if 'info_start' in BENCH:
            print("__ %5f Info" % (BENCH['info_end'] - BENCH['info_start']))
        if 'plotting_start' in BENCH:
            print("__ %5f Plotting" % (BENCH['plotting_end'] - BENCH['plotting_start']))
        if 'behavior_start' in BENCH:
            print("__ %5f Behavior" % (BENCH['behavior_end'] - BENCH['behavior_start']))
        if 'neo_start' in BENCH:
            print("__ %5f Neo4j" % (BENCH['neo_end'] - BENCH['neo_start']))
        if 'csv_start' in BENCH:
            print("__ %5f CSV dump" % (BENCH['csv_end'] - BENCH['csv_start']))
        if 'gml_start' in BENCH:
            print("__ %5f GML dump" % (BENCH['gml_end'] - BENCH['gml_start']))
        if 'd3_start' in BENCH:
            print("__ %5f D3 dump" % (BENCH['d3_end'] - BENCH['d3_start']))

    else:
        print("Potentially not a PE file %s" % args.input)