Let's write 32-bit shellcode in x86!
For a long time, I've been intrigued by computers; that's why I enjoy working with low-level languages such as C++. But I came to the realization that, even there, I am still just programming against interfaces such as the Win32 API.
So, I began looking at how LoadLibraryA works internally and how a program resolves the exports of a loaded library. This is where I eventually stumbled upon the concept of position-independent code (PIC): code that can execute from memory regardless of where it is loaded. It comes with restrictions, such as not being able to rely on dynamic linking (an import table) or on absolute addresses.
Chapter 1: The Process Environment Block
When we write position-independent code (PIC), we cannot rely on absolute addresses, an import table, or similar conveniences that the loader normally provides.
So resolving functions such as LoadLibraryA, MessageBoxA and ShellExecuteA becomes impossible, or does it? Luckily for us, the Process Environment Block (PEB) contains an Ldr (loader data) member that lists the modules already loaded into the process, including system libraries such as NTDLL.dll and KERNEL32.dll.
We can parse the PE headers of these system libraries manually to resolve their exports and thereby use the Win32 API. On x86 Win32, a pointer to the PEB can be read from the FS segment register at offset 0x30:
mov eax, fs: [0x30]
Now that we have the PEB structure pointer, let's take a look at the PEB struct so we know where to access the Ldr member.
typedef struct _PEB {
BYTE Reserved1[2];
BYTE BeingDebugged;
BYTE Reserved2[1];
PVOID Reserved3[2];
PPEB_LDR_DATA Ldr;
PRTL_USER_PROCESS_PARAMETERS ProcessParameters;
PVOID Reserved4[3];
PVOID AtlThunkSListPtr;
PVOID Reserved5;
ULONG Reserved6;
PVOID Reserved7;
ULONG Reserved8;
ULONG AtlThunkSListPtr32;
PVOID Reserved9[45];
BYTE Reserved10[96];
PPS_POST_PROCESS_INIT_ROUTINE PostProcessInitRoutine;
BYTE Reserved11[128];
PVOID Reserved12[1];
ULONG SessionId;
} PEB, *PPEB;
As we can see, Ldr is the fifth member of the PEB structure. Since guessing the offset is error-prone, we can use a bit of C to compute it:
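printf("PEB->Ldr: %02x\n", offsetof(PEB, Ldr));
The result is 0c when compiled as a 32-bit program (the offset differs in a 64-bit build, where pointers are 8 bytes wide).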
mov eax, fs: [0x30] ; Process Environment Block (PEB)
mov eax, [eax + 0xC] ; PEB->Ldr
So we can access Ldr by dereferencing eax at offset 0xC.
typedef struct _PEB_LDR_DATA {
BYTE Reserved1[8];
PVOID Reserved2[3];
LIST_ENTRY InMemoryOrderModuleList;
} PEB_LDR_DATA, *PPEB_LDR_DATA;
Next we do the same for InMemoryOrderModuleList in PEB_LDR_DATA, which gives an offset of 0x14. Now that we know Ldr is at 0xC and InMemoryOrderModuleList is at 0x14, we can load the effective address of InMemoryOrderModuleList (the list head) and iterate through it.
mov eax, fs: [0x30] ; Process Environment Block (PEB)
mov eax, [eax + 0xC] ; PEB->Ldr
lea edi, [eax + 0x14] ; PEB->Ldr->InMemoryOrderModuleList
There is something to note: the documented LDR_DATA_TABLE_ENTRY does not officially expose a member that holds only the DLL name; however, Reserved4 appears to contain a UNICODE_STRING (referred to as BaseDllName later on) that holds just the file name, i.e., NTDLL.DLL or KERNEL32.DLL.
typedef struct _LDR_DATA_TABLE_ENTRY {
PVOID Reserved1[2];
LIST_ENTRY InMemoryOrderLinks;
PVOID Reserved2[2];
PVOID DllBase;
PVOID Reserved3[2];
UNICODE_STRING FullDllName;
BYTE Reserved4[8];
PVOID Reserved5[3];
union
{
ULONG CheckSum;
PVOID Reserved6;
};
ULONG TimeDateStamp;
} LDR_DATA_TABLE_ENTRY, *PLDR_DATA_TABLE_ENTRY;
So we use that instead and check whether the Length of the UNICODE_STRING is 24 to find KERNEL32.DLL. Length is expressed in bytes and the string is UTF-16, so the 12 characters of KERNEL32.DLL give 12 × 2 = 24. Keep in mind that this is only a heuristic: nothing prevents another module (or the host executable itself) from having a 12-character file name, so the code relies on KERNEL32.DLL being the first entry of that length in the list.
; Walk the InMemoryOrderModuleList looking for KERNEL32.DLL.
; In : edi = address of the list head (PEB->Ldr->InMemoryOrderModuleList)
; Out: eax = DllBase of the first entry whose name Length is 24 bytes
; NOTE: the match is by name length only, not by name content.
mov esi, [edi] ; esi = head->Flink (first entry)
loop_til_kernel32_found:
cmp esi, edi ; Back at the list head means every entry was visited
je cleanup ; If the loop ends before we find KERNEL32.dll, jump to cleanup
mov ebx, esi ; Work on a copy so esi keeps pointing at the list links
sub ebx, 0x8 ; InMemoryOrderLinks sits at +0x8, so step back to the entry base
lea ecx, [ebx + 0x2C] ; Address of the UNICODE_STRING stored in Reserved4
movzx edx, word ptr [ecx] ; Length is the first 16-bit field of UNICODE_STRING
cmp edx, 24 ; 12 UTF-16 characters, the length of KERNEL32.DLL
jne not_kernel32
mov eax, [ebx + 0x18] ; DllBase
jmp found_kernel32
not_kernel32:
mov esi, [esi] ; Follow Flink to the next entry
jmp loop_til_kernel32_found
Now that we have loaded the DllBase into eax, we have to start parsing the PE header.
Chapter 2: PE Header & Data Directories
The PE header can be considered the "metadata" of a file; it tells us what the file embeds, where the code is located, and which functions it imports or exports. By parsing the header of the in-memory KERNEL32.dll image, we can get access to other Win32 functionality.
Let's walk through the process of implementing GetProcAddress from scratch:
First we should get the PE header offset (e_lfanew) from the DOS header and read the NT header.
IMAGE_DOS_HEADER* dosHeader = reinterpret_cast<IMAGE_DOS_HEADER*>(modlib);
IMAGE_NT_HEADERS32* ntHeader = reinterpret_cast<IMAGE_NT_HEADERS*>(
(reinterpret_cast<BYTE*>(modlib) + dosHeader->e_lfanew)
);
Then we should read the DataDirectory from OptionalHeader.
MAGE_OPTIONAL_HEADER32 optionalHeader ntHeader->OptionalHeader;
IMAGE_DATA_DIRECTORY dataDirectory = optionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT];
Take the relative virtual address (RVA) from DataDirectory and convert it to a virtual address by adding the module base.
PIMAGE_EXPORT_DIRECTORY exportDirectory = reinterpret_cast<PIMAGE_EXPORT_DIRECTORY>((reinterpret_cast<BYTE*>(modlib) + dataDirectory.VirtualAddress));
Now that we have a valid exportDirectory address, we should loop through each exported name and try to match LoadLibraryA.
DWORD* AddressOfNames = reinterpret_cast<DWORD*>(
(reinterpret_cast<BYTE*>(modlib) + exportDirectory->AddressOfNames)
);
DWORD* AddressOfFunctions = reinterpret_cast<DWORD*>(
(reinterpret_cast<BYTE*>(modlib) + exportDirectory->AddressOfFunctions)
);
WORD* AddressOfNameOrdinals = reinterpret_cast<WORD*>(
(reinterpret_cast<BYTE*>(modlib) + exportDirectory->AddressOfNameOrdinals)
);
for (int i = 0; i < exportDirectory->NumberOfNames; i++) {
const char* name = reinterpret_cast<const char*>(
reinterpret_cast<BYTE*>(modlib) + AddressOfNames[i]
);
if (strcmp(name, "LoadLibraryA") == 0) {
auto function = reinterpret_cast<void*>(
reinterpret_cast<BYTE*>(modlib) + AddressOfFunctions[AddressOfNameOrdinals[i]]
);
// ...
}
}
Now that we have a reference for how this is implemented in C++, how do we implement it in assembly? Let's say that eax = DllBase.
mov ebx, [eax + 0x3C] ; IMAGE_DOS_HEADER e_lfanew
add ebx, eax ; IMAGE_NT_HEADERS32
First we load the address of the NT headers into ebx: we read e_lfanew from the DOS header at offset 0x3C, then add eax (the module base) so that ebx points at the NT headers.
mov edx, [ebx + 0x78] ; OptionalHeader->DataDirectory[0]
add edx, eax ; RVA -> VA
Next, load the RVA stored in DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT] through ebx, which is pointing to the NT headers. OptionalHeader is at offset 0x18 within IMAGE_NT_HEADERS32 and DataDirectory is at offset 0x60 within the optional header, so the entry is found at 0x18 + 0x60 = 0x78. Then we convert the relative address to an absolute one.
mov ebx, [edx + 0x18] ; NumberOfNames
dec ebx
Since we're implementing a reverse loop, we move the number of exported names to ebx and immediately decrement it, so that ebx holds the last valid index (NumberOfNames - 1) and we don't read out of bounds.
mov esi, [edx + 0x20] ; AddressOfNames
add esi, eax
We load the base pointer of AddressOfNames outside of the loop so we don't have to recalculate it every time.
loop_til_loadlibrary_found:
test ebx, ebx
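je cleanup
Check if ebx is zero; if it is, it means we did not find LoadLibraryA before AddressOfNames ran out, so just clean up. (Strictly speaking, this stops before the name at index 0 is examined; that is harmless as long as the function we are looking for is not the very first name in the table.)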
mov edi, [esi + ebx * 4] ; AddressOfNames[ebx]
add edi, eax
mov ecx, [edi]
Load the RVA of the current name from AddressOfNames using the index in ebx, convert it to an absolute address, and read the first four characters of the name into ecx.
cmp ecx, 0x64616F4C ; Load
jne not_loadlibrary
mov ecx, [edi + 4]
cmp ecx, 0x7262694C ; Libr
jne not_loadlibrary
mov ecx, [edi + 8]
cmp ecx, 0x41797261 ; aryA
jne not_loadlibrary
jmp loadlibrary_found
Compare the export name that edi points to against "LoadLibraryA" one DWORD (four characters, stored little-endian) at a time; if all three chunks match, then we have found our function; otherwise, continue the loop.
not_loadlibrary:
dec ebx
jmp loop_til_loadlibrary_found
Decrement the ebx register and jump back to the top of the loop, as this routine only triggers when a match was not found.
loadlibrary_found:
mov esi, [edx+0x24] ; AddressOfNameOrdinals
add esi, eax
movzx edi, word ptr [esi + ebx * 2]
mov esi, [edx+0x1C] ; AddressOfFunctions
add esi, eax
mov ecx, [esi + edi * 4]
add ecx, eax
Load the ordinal for the current name index into the edi register, then use it to index AddressOfFunctions, convert the resulting RVA to a function pointer, and store it in ecx. And there it is; that's how we can manually implement functionality equivalent to GetProcAddress without having to use it.
Chapter 3: Calling Conventions
Now that we have the address of our function, we need to figure out how to invoke it. This can differ widely depending on the calling convention of the function; Win32 uses __stdcall.
When using __stdcall, the callee removes its own arguments from the stack, so we only have to clean up our own allocation:
sub esp, 12
/* ... */
lea ebx, [esp] ; shell32.dll
push ebx
call ecx
add esp, 12
If this were __cdecl, we would have to add another 4 bytes because the caller is responsible for removing the pushed argument from the stack:
sub esp, 12
/* ... */
lea ebx, [esp] ; shell32.dll
push ebx
call ecx
add esp, 16
Chapter 4: Pushing Arguments onto the Stack
Let's say we have ShellExecuteA which has the following API definition:
HINSTANCE ShellExecuteA(
[in, optional] HWND hwnd,
[in, optional] LPCSTR lpOperation,
[in] LPCSTR lpFile,
[in, optional] LPCSTR lpParameters,
[in, optional] LPCSTR lpDirectory,
[in] INT nShowCmd
);
To call it in C, we would write something like this.
ShellExecuteA(0, "open", "calc", 0, 0, 0);
Now, you might assume that in assembly we would push the arguments like this:
push 0 ; hwnd
push eax ; lpOperation
push ebx ; lpFile
push 0 ; lpParameters
push 0 ; lpDirectory
push 0 ; nShowCmd
call ShellExecuteA
For the sake of example, let's assume eax points to "open" and ebx points to "calc", both of which have been allocated on the stack. This code does not work, because arguments are pushed in the reverse of the order in which they appear in the function's declaration: the last parameter is pushed first, so that the first parameter ends up on top of the stack.
So the way it would be done is:
push 0 ; nShowCmd
push 0 ; lpDirectory
push 0 ; lpParameters
push ebx ; lpFile
push eax ; lpOperation
push 0 ; hwnd
call ShellExecuteA
And this would invoke the function correctly.
Chapter 5: Putting it all together
Now that we have all the building blocks, let's assemble the final code to execute calc through ShellExecuteA from our shellcode:
; ---------------------------------------------------------------------------
; Run "calc" through ShellExecuteA without relying on any import.
;
; Target : 32-bit x86 Windows, Intel syntax.
; Steps  : 1. walk PEB->Ldr->InMemoryOrderModuleList to find KERNEL32.DLL
;          2. scan KERNEL32's export names for LoadLibraryA
;          3. LoadLibraryA("shell32.dll")
;          4. scan shell32's export names for ShellExecuteA
;          5. ShellExecuteA(0, "open", "calc", 0, 0, 0)
; Errors : every failure path jumps to `cleanup`; esp is balanced on all of them.
; Clobbers: eax, ebx, ecx, edx, esi, edi, flags.
; ---------------------------------------------------------------------------

; --- 1. Locate KERNEL32.DLL -------------------------------------------------
    mov     eax, fs: [0x30]             ; eax = Process Environment Block (PEB)
    mov     eax, [eax + 0xC]            ; eax = PEB->Ldr
    lea     edi, [eax + 0x14]           ; edi = &Ldr->InMemoryOrderModuleList (list head)
    mov     esi, [edi]                  ; esi = head->Flink (first entry)

; NOTE: the entry is matched by name length only. Any module (or host
; executable) with a 12-character file name that appears earlier in the list
; would be taken for KERNEL32.DLL.
loop_til_kernel32_found:
    cmp     esi, edi                    ; back at the head => list exhausted
    je      cleanup
    mov     ebx, esi
    sub     ebx, 0x8                    ; ebx = entry base (InMemoryOrderLinks is at +0x8)
    lea     ecx, [ebx + 0x2C]           ; ecx = &entry->BaseDllName (UNICODE_STRING)
    movzx   edx, word ptr [ecx]         ; edx = BaseDllName.Length, in bytes
    cmp     edx, 24                     ; 12 UTF-16 characters, as in KERNEL32.DLL
    jne     not_kernel32
    mov     eax, [ebx + 0x18]           ; eax = entry->DllBase
    jmp     found_kernel32

not_kernel32:
    mov     esi, [esi]                  ; esi = esi->Flink
    jmp     loop_til_kernel32_found

; --- 2. Resolve LoadLibraryA from KERNEL32's export table -------------------
; Invariants below: eax = module base, edx = IMAGE_EXPORT_DIRECTORY,
; esi = AddressOfNames table, ebx = current name index (counts down).
found_kernel32:
    mov     ebx, [eax + 0x3C]           ; IMAGE_DOS_HEADER.e_lfanew
    add     ebx, eax                    ; ebx = IMAGE_NT_HEADERS32
    mov     edx, [ebx + 0x78]           ; OptionalHeader.DataDirectory[0] (export RVA)
    add     edx, eax                    ; RVA -> VA
    mov     ebx, [edx + 0x18]           ; NumberOfNames
    dec     ebx                         ; ebx = last valid index
    mov     esi, [edx + 0x20]           ; AddressOfNames
    add     esi, eax

loop_til_loadlibrary_found:
    test    ebx, ebx
    js      cleanup                     ; js, not je: index 0 is still compared, and an
                                        ; empty table (ebx = -1) exits before any read
    mov     edi, [esi + ebx * 4]        ; AddressOfNames[ebx]
    add     edi, eax                    ; edi = pointer to the export name
    mov     ecx, [edi]
    cmp     ecx, 0x64616F4C             ; "Load"
    jne     not_loadlibrary
    mov     ecx, [edi + 4]
    cmp     ecx, 0x7262694C             ; "Libr"
    jne     not_loadlibrary
    mov     ecx, [edi + 8]
    cmp     ecx, 0x41797261             ; "aryA"
    jne     not_loadlibrary
    jmp     loadlibrary_found

not_loadlibrary:
    dec     ebx
    jmp     loop_til_loadlibrary_found

loadlibrary_found:
    mov     esi, [edx + 0x24]           ; AddressOfNameOrdinals
    add     esi, eax
    movzx   edi, word ptr [esi + ebx * 2] ; edi = ordinal for name index ebx
    mov     esi, [edx + 0x1C]           ; AddressOfFunctions
    add     esi, eax
    mov     ecx, [esi + edi * 4]        ; function RVA
    add     ecx, eax                    ; ecx = LoadLibraryA

; --- 3. LoadLibraryA("shell32.dll") -----------------------------------------
    sub     esp, 12                     ; 12 bytes: "shell32.dll" + NUL
    mov     byte ptr [esp], 0x73        ; s
    mov     byte ptr [esp + 1], 0x68    ; h
    mov     byte ptr [esp + 2], 0x65    ; e
    mov     byte ptr [esp + 3], 0x6C    ; l
    mov     byte ptr [esp + 4], 0x6C    ; l
    mov     byte ptr [esp + 5], 0x33    ; 3
    mov     byte ptr [esp + 6], 0x32    ; 2
    mov     byte ptr [esp + 7], 0x2E    ; .
    mov     byte ptr [esp + 8], 0x64    ; d
    mov     byte ptr [esp + 9], 0x6C    ; l
    mov     byte ptr [esp + 10], 0x6C   ; l
    mov     byte ptr [esp + 11], 0x00   ; NUL
    lea     ebx, [esp]                  ; ebx = "shell32.dll"
    push    ebx
    call    ecx                         ; __stdcall: the callee pops the argument
    add     esp, 12                     ; release the string buffer
    test    eax, eax                    ; eax = returned module handle, 0 on failure
    jz      cleanup

; --- 4. Resolve ShellExecuteA from shell32's export table -------------------
; Same register roles as in step 2, with eax = shell32 base.
    mov     ebx, [eax + 0x3C]           ; e_lfanew
    add     ebx, eax                    ; ebx = IMAGE_NT_HEADERS32
    mov     edx, [ebx + 0x78]           ; export directory RVA
    add     edx, eax
    mov     ebx, [edx + 0x18]           ; NumberOfNames
    dec     ebx
    mov     esi, [edx + 0x20]           ; AddressOfNames
    add     esi, eax

loop_til_shellexecute_found:
    test    ebx, ebx
    js      cleanup                     ; see loop_til_loadlibrary_found
    mov     edi, [esi + ebx * 4]
    add     edi, eax
    mov     ecx, [edi]
    cmp     ecx, 0x6C656853             ; "Shel"
    jne     not_shellexecute
    mov     ecx, [edi + 4]
    cmp     ecx, 0x6578456C             ; "lExe"
    jne     not_shellexecute
    mov     ecx, [edi + 8]
    cmp     ecx, 0x65747563             ; "cute"
    jne     not_shellexecute
    cmp     byte ptr [edi + 12], 0x41   ; "A"
    jne     not_shellexecute
    jmp     shellexecute_found

not_shellexecute:
    dec     ebx
    jmp     loop_til_shellexecute_found

shellexecute_found:
    mov     esi, [edx + 0x24]           ; AddressOfNameOrdinals
    add     esi, eax
    movzx   edi, word ptr [esi + ebx * 2]
    mov     esi, [edx + 0x1C]           ; AddressOfFunctions
    add     esi, eax
    mov     ecx, [esi + edi * 4]
    add     ecx, eax                    ; ecx = ShellExecuteA

; --- 5. ShellExecuteA(0, "open", "calc", 0, 0, 0) ---------------------------
; The two strings need 10 bytes; 12 are reserved so that esp stays 4-byte
; aligned for the call (the last 2 bytes are unused padding).
    sub     esp, 12
    mov     byte ptr [esp], 0x6F        ; o
    mov     byte ptr [esp + 1], 0x70    ; p
    mov     byte ptr [esp + 2], 0x65    ; e
    mov     byte ptr [esp + 3], 0x6E    ; n
    mov     byte ptr [esp + 4], 0x00    ; NUL
    lea     ebx, [esp]                  ; ebx = "open"
    mov     byte ptr [esp + 5], 0x63    ; c
    mov     byte ptr [esp + 6], 0x61    ; a
    mov     byte ptr [esp + 7], 0x6C    ; l
    mov     byte ptr [esp + 8], 0x63    ; c
    mov     byte ptr [esp + 9], 0x00    ; NUL
    lea     edx, [esp + 5]              ; edx = "calc"
    ; Arguments are pushed last-to-first.
    push    0                           ; nShowCmd (0 is SW_HIDE; 1 is SW_SHOWNORMAL)
    push    0                           ; lpDirectory
    push    0                           ; lpParameters
    push    edx                         ; lpFile
    push    ebx                         ; lpOperation
    push    0                           ; hwnd
    call    ecx                         ; __stdcall: the callee pops the six arguments
    add     esp, 12                     ; release the string buffer

; --- Common exit: zero the registers used above -----------------------------
cleanup:
    xor     eax, eax
    xor     ebx, ebx
    xor     ecx, ecx
    xor     edx, edx
    xor     esi, esi
xor edi, edi
Chapter 6: Final words
This was an interesting journey of learning more about OS internals and how to write position-independent code.
However, my implementation is far from perfect; it is the result of two days of experimentation and is not intended to work in all environments.
It has been tested and run on Windows 11, but I cannot guarantee that it works on any other system.