diff --git a/parser.py b/parser.py index 8577788..7297e38 100644 --- a/parser.py +++ b/parser.py @@ -48,31 +48,33 @@ switch_addrs = set(switch_addrs) ## Parse IDA HTML ## -# See if current function is referenced by the inputted comparison address -def compare_xref_addr(line, compare_addr): - # Get the address of the referencing function - xref_idx = line.find('CODE XREF: sub_') - # If there is not a referencing function or it is in a different file, this doesn't need to be verified - if xref_idx == -1: - return True - else: - xref = line[xref_idx+15:xref_idx+23] - - # Check equality between XREF address and comparison address - return xref == compare_addr - # Initialize list to store start and end of functions functs = [] # Count how many functions have been added num_functs = 0 +# Function for adding to function list and incrementing count +def add_function(new_start_addr, prev_end_addr): + global num_functs + # If an end address for the last added function was specified + if prev_end_addr != None: + # Set end address for last added function + functs[num_functs-1][1] = prev_end_addr + # Add a new function to the list with the specified starting address + functs.append([new_start_addr, 0, []]) + # Increment the number of functions + num_functs = num_functs+1 + # Mark if we are in .text section in_text = False # Mark if we should end parsing end_parse = False +# Initialize address of last blr instruction to 0 +blr_addr = '00000000' + # Initialize address of last padding to 0 pad_addr = '00000000' @@ -89,58 +91,54 @@ with open(ida_html, 'r') as file: curr_addr = line[colon_idx+1:colon_idx+9] # Check if this is the start of a function - if re.search('^\.text:'+curr_addr+' sub_'+curr_addr, line): - # Check if this is a new function and not part of a switch + if re.search('^\.text:'+curr_addr+' sub_'+curr_addr+': *# [A-Z][A-Z][A-Z][A-Z] XREF:.*', line): + # Save current address as integer + curr_addr_int = int(curr_addr, 16) + if num_functs > 0: - # If the referencing function is not the last added function, then it is not part of a switch - if not compare_xref_addr(line, functs[num_functs-1][0]): - # Add this address as a new function - functs.append([curr_addr, 0]) - num_functs = num_functs+1 - # Convert addresses to integer for comparison - curr_addr_int = int(curr_addr, 16) - pad_addr_int = int(pad_addr, 16) - # If previous address was padding, end last function at the padding - if curr_addr_int-4 == pad_addr_int: - functs[num_functs-2][1] = pad_addr_int - # Else, end last function as this address - else: - functs[num_functs-2][1] = curr_addr_int - - # If this is the first function to be added, don't need to check if it is part of a switch + # If last address had padding, then this function was already added + if not curr_addr_int-4 == int(pad_addr, 16): + # Check if this function is part of latest added function + is_nested_funct = False + nested_functs = functs[num_functs-1][2] + for nested_funct in nested_functs: + is_nested_funct = nested_funct==curr_addr + + # If last address was not padding and not nested in latest function + if not is_nested_funct: + # If this is not the first function being added + if num_functs > 0: + # Add new function and last function's end address + add_function(curr_addr_int, curr_addr_int) else: - # Add this address as a new function - functs.append([curr_addr, 0]) - num_functs = num_functs+1 + # Add new function + add_function(curr_addr_int, None) - # If this is not the start of a function - else: - # Check if it is a nested loc_ or def_ - if re.search('^\.text:'+curr_addr+' [ld][oe][cf]_'+curr_addr, line): - # If the referencing function is not the last added function, then it is not part of a switch - if not compare_xref_addr(line, functs[num_functs-1][0]): - # Add this address as a new function - functs.append([curr_addr, 0]) - num_functs = num_functs+1 - # Convert addresses to integer for comparison - curr_addr_int = int(curr_addr, 16) - pad_addr_int = int(pad_addr, 16) - # If previous address was padding, end last function at the padding - if curr_addr_int-4 == pad_addr_int: - functs[num_functs-2][1] = pad_addr_int - # End the last function at the previous address - else: - functs[num_functs-2][1] = curr_addr_int - - # Check if this line is padding - elif re.search('\.long 0$', line): - # Save address of most recently found padding - pad_addr = curr_addr + # If this is a location + elif re.search('^\.text:'+curr_addr+' loc_'+curr_addr, line): + curr_addr_int = int(curr_addr, 16) + # If previous address was a blr instruction + if curr_addr_int-4 == blr_addr: + print(curr_addr) + add_function(curr_addr_int, curr_addr_int) + # If not, store as nested function in latest function + else: + # Find address of function that references this + xref_idx = line.find('XREF: sub_') + # If it was found + if xref_idx > -1: + # Store as nested function in latest function + functs[num_functs-1][2].append(line[xref_idx+10:xref_idx+18]) - # Check if we are still in .text - elif re.search('\.text:', line) == None: - # If not, end parsing - end_parse = True + # Check if this line is padding + elif num_functs > 0 and re.search('\.long 0$', line): + curr_addr_int = int(curr_addr, 16) + # Add a new function at the line after padding, and end the current function at this padding address + add_function(curr_addr_int+4, curr_addr_int) + + # Check for blr instruction + elif re.search('blr', line): + blr_addr = curr_addr # If not in .text else: @@ -171,13 +169,13 @@ for switch_addr in switch_addrs: while(search_for_funct): curr_funct = functs[curr_funct_idx] # If switch address is after this function's start - curr_funct_start = int(curr_funct[0], 16) + curr_funct_start = curr_funct[0] if(switch_addr_int > curr_funct_start): # If switch address is before this function's end curr_funct_end = curr_funct[1] if(switch_addr_int <= curr_funct_end): # Save current function's start address and the function's length - output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start)]) + output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start), switch_addr]) # Don't need to continue search for this switch statement search_for_funct = False @@ -202,7 +200,7 @@ for funct in output_functs: curr_funct_end = '0x'+funct[1][2:].upper() # Format function - curr_funct = "\n { address = "+curr_funct_start+", size = "+curr_funct_end+" }," + curr_funct = "\n { address = "+curr_funct_start+", size = "+curr_funct_end+" src = "+funct[2]+" }," # Add to complete output string output_str = output_str+curr_funct