Best winapi questions in May 2011

Checking digital signature programmatically from Delphi

10 votes

I need a function in Delphi to verify the digital signature of an external EXE or DLL. In my particular application, I am going to occasionally invoke other processes, but for security purposes I want to make sure these executables were created by our organization before running them.

I have seen Microsoft's example in C, however, I do not want to waste the time translating this to Delphi if somebody else already has.

I would prefer a snippet or code example over a third-party library. Thanks.

Here you go:

// IsCodeSigned, which verifies that the exe hasn't been modified, uses
// WinVerifyTrust, so it's NT only.  IsCodeSignedByUs works on Win9x, but it
// only checks that the signing certificate hasn't been replaced, which keeps
// someone from re-signing a modified executable.

// Imagehlp.dll
const
  CERT_SECTION_TYPE_ANY = $FF;      // Any Certificate type

function ImageEnumerateCertificates(FileHandle: THandle; TypeFilter: WORD;
  out CertificateCount: DWORD; Indicies: PDWORD; IndexCount: Integer): BOOL; stdcall; external 'Imagehlp.dll';
function ImageGetCertificateHeader(FileHandle: THandle; CertificateIndex: Integer;
  var CertificateHeader: TWinCertificate): BOOL; stdcall; external 'Imagehlp.dll';
function ImageGetCertificateData(FileHandle: THandle; CertificateIndex: Integer;
  Certificate: PWinCertificate; var RequiredLength: DWORD): BOOL; stdcall; external 'Imagehlp.dll';

// Crypt32.dll
const
  CERT_NAME_SIMPLE_DISPLAY_TYPE = 4;
  PKCS_7_ASN_ENCODING = $00010000;
  X509_ASN_ENCODING = $00000001;

type
  PCCERT_CONTEXT = type Pointer;
  HCRYPTPROV_LEGACY = type Pointer;
  PFN_CRYPT_GET_SIGNER_CERTIFICATE = type Pointer;

  CRYPT_VERIFY_MESSAGE_PARA = record
    cbSize: DWORD;
    dwMsgAndCertEncodingType: DWORD;
    hCryptProv: HCRYPTPROV_LEGACY;
    pfnGetSignerCertificate: PFN_CRYPT_GET_SIGNER_CERTIFICATE;
    pvGetArg: Pointer;
  end;

function CryptVerifyMessageSignature(const pVerifyPara: CRYPT_VERIFY_MESSAGE_PARA;
  dwSignerIndex: DWORD; pbSignedBlob: PByte; cbSignedBlob: DWORD; pbDecoded: PBYTE;
  pcbDecoded: PDWORD; ppSignerCert: PCCERT_CONTEXT): BOOL; stdcall; external 'Crypt32.dll';
function CertGetNameStringA(pCertContext: PCCERT_CONTEXT; dwType: DWORD; dwFlags: DWORD; pvTypePara: Pointer;
  pszNameString: PAnsiChar; cchNameString: DWORD): DWORD; stdcall; external 'Crypt32.dll';
function CertFreeCertificateContext(pCertContext: PCCERT_CONTEXT): BOOL; stdcall; external 'Crypt32.dll';
function CertCreateCertificateContext(dwCertEncodingType: DWORD;
  pbCertEncoded: PBYTE; cbCertEncoded: DWORD): PCCERT_CONTEXT; stdcall; external 'Crypt32.dll';

// WinTrust.dll
const
  WINTRUST_ACTION_GENERIC_VERIFY_V2: TGUID = '{00AAC56B-CD44-11d0-8CC2-00C04FC295EE}';
  WTD_CHOICE_FILE = 1;
  WTD_REVOKE_NONE = 0;
  WTD_UI_NONE = 2;

type
  PWinTrustFileInfo = ^TWinTrustFileInfo;
  TWinTrustFileInfo = record
    cbStruct: DWORD;                    // = sizeof(WINTRUST_FILE_INFO)
    pcwszFilePath: PWideChar;           // required, file name to be verified
    hFile: THandle;                     // optional, open handle to pcwszFilePath
    pgKnownSubject: PGUID;              // optional: fill if the subject type is known
  end;

  PWinTrustData = ^TWinTrustData;
  TWinTrustData = record
    cbStruct: DWORD;
    pPolicyCallbackData: Pointer;
    pSIPClientData: Pointer;
    dwUIChoice: DWORD;
    fdwRevocationChecks: DWORD;
    dwUnionChoice: DWORD;
    pFile: PWinTrustFileInfo;
    dwStateAction: DWORD;
    hWVTStateData: THandle;
    pwszURLReference: PWideChar;
    dwProvFlags: DWORD;
    dwUIContext: DWORD;
  end;

function WinVerifyTrust(hwnd: HWND; const ActionID: TGUID; ActionData: Pointer): Longint; stdcall; external wintrust;

{-----------------------------------------------}

function IsCodeSigned(const Filename: string): Boolean;
var 
  file_info: TWinTrustFileInfo;
  trust_data: TWinTrustData;
begin
  // Verify that the exe is signed and the checksum matches
  FillChar(file_info, SizeOf(file_info), 0);
  file_info.cbStruct := sizeof(file_info);
  file_info.pcwszFilePath := PWideChar(WideString(Filename));
  FillChar(trust_data, SizeOf(trust_data), 0);
  trust_data.cbStruct := sizeof(trust_data);
  trust_data.dwUIChoice := WTD_UI_NONE;
  trust_data.fdwRevocationChecks := WTD_REVOKE_NONE;
  trust_data.dwUnionChoice := WTD_CHOICE_FILE;
  trust_data.pFile := @file_info;
  Result := WinVerifyTrust(INVALID_HANDLE_VALUE, WINTRUST_ACTION_GENERIC_VERIFY_V2,
    @trust_data) = ERROR_SUCCESS
end;

{-----------------------------------------------}

function IsCompanySigningCertificate(const Filename, CompanyName :string): Boolean;
var
  hExe: HMODULE;
  Cert: PWinCertificate;
  CertContext: PCCERT_CONTEXT;
  CertCount: DWORD;
  CertName: AnsiString;
  CertNameLen: DWORD;
  VerifyParams: CRYPT_VERIFY_MESSAGE_PARA;
begin
  // Returns TRUE if the SubjectName on the certificate used to sign the exe is
  // "Company Name".  Should prevent a cracker from modifying the file and
  // re-signing it with their own certificate.
  //
  // Microsoft has an example that does this using CryptQueryObject and
  // CertFindCertificateInStore instead of CryptVerifyMessageSignature, but
  // CryptQueryObject is NT-only.  Using CertCreateCertificateContext doesn't work
  // either, though I don't know why.
  Result := False;
  // Verify that the exe was signed by our private key
  hExe := CreateFile(PChar(Filename), GENERIC_READ, FILE_SHARE_READ,
    nil, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL or FILE_FLAG_RANDOM_ACCESS, 0);
  if hExe = INVALID_HANDLE_VALUE then
    Exit;
  try
    // There should only be one certificate associated with the exe
    if (not ImageEnumerateCertificates(hExe, CERT_SECTION_TYPE_ANY, CertCount, nil, 0)) or
       (CertCount <> 1) then
      Exit;
    // Read the certificate header so we can get the size needed for the full cert
    GetMem(Cert, SizeOf(TWinCertificate) + 3); // ImageGetCertificateHeader writes an DWORD at bCertificate for some reason
    try
      Cert.dwLength := 0;
      Cert.wRevision := WIN_CERT_REVISION_1_0;
      if not ImageGetCertificateHeader(hExe, 0, Cert^) then
        Exit;
      // Read the full certificate
      ReallocMem(Cert, SizeOf(TWinCertificate) + Cert.dwLength);
      if not ImageGetCertificateData(hExe, 0, Cert, Cert.dwLength) then
        Exit;
      // Get the certificate context.  CryptVerifyMessageSignature has the
      // side effect of creating a context for the signing certificate.
      FillChar(VerifyParams, SizeOf(VerifyParams), 0);
      VerifyParams.cbSize := SizeOf(VerifyParams);
      VerifyParams.dwMsgAndCertEncodingType := X509_ASN_ENCODING or PKCS_7_ASN_ENCODING;
      if not CryptVerifyMessageSignature(VerifyParams, 0, @Cert.bCertificate,
         Cert.dwLength, nil, nil, @CertContext) then
        Exit;
      try
        // Extract and compare the certificate's subject names.  Don't
        // compare the entire certificate or the public key as those will
        // change when the certificate is renewed.
        CertNameLen := CertGetNameStringA(CertContext,
          CERT_NAME_SIMPLE_DISPLAY_TYPE, 0, nil, nil, 0);
        SetLength(CertName, CertNameLen - 1);
        CertGetNameStringA(CertContext, CERT_NAME_SIMPLE_DISPLAY_TYPE, 0,
          nil, PAnsiChar(CertName), CertNameLen);
        if CertName <> CompanyName then 
          Exit;
      finally
        CertFreeCertificateContext(CertContext)
      end;
    finally
      FreeMem(Cert);
    end;
  finally
    CloseHandle(hExe);
  end;
  Result := True;
end;

Explanation for tiny reads (overlapped, buffered) outperforming large contiguous reads?

9 votes

(apologies for the somewhat lengthy intro)

During development of an application which prefaults an entire large file (>400MB) into the buffer cache for speeding up the actual run later, I tested whether reading 4MB at a time still had any noticeable benefits over reading only 1MB chunks at a time. Surprisingly, the smaller requests actually turned out to be faster. This seemed counter-intuitive, so I ran a more extensive test.

The buffer cache was purged before running the tests (just for laughs, I did one run with the file in the buffers, too. The buffer cache delivers upwards of 2GB/s regardless of request size, though with a surprising +/- 30% random variance).
All reads used overlapped ReadFile with the same target buffer (the handle was opened with FILE_FLAG_OVERLAPPED and without FILE_FLAG_NO_BUFFERING). The harddisk used is somewhat elderly but fully functional, NTFS has a cluster size of 8kB. The disk was defragmented after an initial run (6 fragments vs. unfragmented, zero difference). For better figures, I used a larger file too, below numbers are for reading 1GB.

The results were really surprising:

4MB x 256    : 5ms per request,    completion 25.8s @ ~40 MB/s
1MB x 1024   : 11.7ms per request, completion 23.3s @ ~43 MB/s
32kB x 32768 : 12.6ms per request, completion 15.5s @ ~66 MB/s
16kB x 65536 : 12.8ms per request, completion 13.5s @ ~75 MB/s

So, this suggests that submitting ten thousands of requests two clusters in length is actually better than submitting a few hundred large, contiguous reads. The submit time (time before ReadFile returns) does go up substantially as the number of requests goes up, but asynchronous completion time nearly halves.
Kernel CPU time is around 5-6% in every case (on a quadcore, so one should really say 20-30%) while the asynchronous reads are completing, which is a surprising amount of CPU -- apparently the OS does some non-neglegible amount of busy waiting, too. 30% CPU for 25 seconds at 2.6 GHz, that's quite a few cycles for doing "nothing".

Any idea how this can be explained? Maybe someone here has a deeper insight of the inner workings of Windows overlapped IO? Or, is there something substantially wrong with the idea that you can use ReadFile for reading a megabyte of data?

I can see how an IO scheduler would be able to optimize multiple requests by minimizing seeks, especially when requests are random access (which they aren't!). I can also see how a harddisk would be able to perform a similar optimization given a few requests in the NCQ.
However, we're talking about ridiculous numbers of ridiculously small requests -- which nevertheless outperform what appears to be sensible by a factor of 2.

Sidenote: The clear winner is memory mapping. I'm almost inclined to add "unsurprisingly" because I am a big fan of memory mapping, but in this case, it actually does surprise me, as the "requests" are even smaller and the OS should be even less able to predict and schedule the IO. I didn't test memory mapping at first because it seemed counter-intuitive that it might be able to compete even remotely. So much for your intuition, heh.

Mapping/unmapping a view repeatedly at different offsets takes practically zero time. Using a 16MB view and faulting every page with a simple for() loop reading a single byte per page completes in 9.2 secs @ ~111 MB/s. CPU usage is under 3% (one core) at all times. Same computer, same disk, same everything.

It also appears that Windows loads 8 pages into the buffer cache at a time, although only one page is actually created. Faulting every 8th page runs at the same speed and loads the same amount of data from disk, but shows lower "physical memory" and "system cache" metrics and only 1/8 of the page faults. Subsequent reads prove that the pages are nevertheless definitively in the buffer cache (no delay, no disk activity).

(Possibly very, very distantly related to Memory-Mapped File is Faster on Huge Sequential Read?)

To make it a bit more illustrative:
enter image description here

Update:

Using FILE_FLAG_SEQUENTIAL_SCAN seems to somewhat "balance" reads of 128k, improving performance by 100%. On the other hand, it severely impacts reads of 512k and 256k (you have to wonder why?) and has no real effect on anything else. The MB/s graph of the smaller blocks sizes arguably seems a little more "even", but there is no difference in runtime.

enter image description here

I may have found an explanation for smaller block sizes performing better, too. As you know, asynchronous requests may run synchronously if the OS can serve the request immediately, i.e. from the buffers (and for a variety of version-specific technical limitations).

When accounting for actual asynchronous vs. "immediate" asyncronous reads, one notices that upwards of 256k, Windows runs every asynchronous request asynchronously. The smaller the blocksize, the more requests are being served "immediately", even when they are not available immediately (i.e. ReadFile simply runs synchronously). I cannot make out a clear pattern (such as "the first 100 requests" or "more than 1000 requests"), but there seems to be an inverse correlation between request size and synchronicity. At a blocksize of 8k, every asynchronous request is served synchronously.
Buffered synchronous transfers are, for some reason, twice as fast as asynchronous transfers (no idea why), hence the smaller the request sizes, the faster the overall transfer, because more transfers are done synchronously.

For memory mapped prefaulting, FILE_FLAG_SEQUENTIAL_SCAN causes a slightly different shape of the performance graph (there is a "notch" which is moved a bit backwards), but the total time taken is exactly identical (again, this is surprising, but I can't help it).

Update 2:

Unbuffered IO makes the performance graphs for the 1M, 4M, and 512k request testcases somewhat higher and more "spiky" with maximums in the 90s of GB/s, but with harsh minumums too, the overall runtime for 1GB is within +/- 0.5s of the buffered run (the requests with smaller buffer sizes complete significantly faster, however, that is because with more than 2558 in-flight requests, ERROR_WORKING_SET_QUOTA is returned). Measured CPU usage is zero in all unbuffered cases, which is unsurprising, since any IO that happens runs via DMA.

Another very interesting observation with FILE_FLAG_NO_BUFFERING is that it significantly changes API behaviour. CancelIO does not work any more, at least not in a sense of cancelling IO. With unbuffered in-flight requests, CancelIO will simply block until all requests have finished. A lawyer would probably argue that the function cannot be held liable for neglecting its duty, because there are no more in-flight requests left when it returns, so in some way it has done what was asked -- but my understanding of "cancel" is somewhat different.
With buffered, overlapped IO, CancelIO will simply cut the rope, all in-flight operations terminate immediately, as one would expect.

Yet another funny thing is that the process is unkillable until all requests have finished or failed. This kind of makes sense if the OS is doing DMA into that address space, but it's a stunning "feature" nevertheless.

I'm not a filesystem expert but I think there are a couple of things going on here. First off. w.r.t. your comment about memory mapping being the winner. This isn't totally surprising since the NT cache manager is based on memory mapping - by doing the memory mapping yourself, you're duplicating the cache manager's behavior without the additional memory copies.

When you read sequentially from the file, the cache manager attempts to pre-fetch the data for you - so it's likely that you are seeing the effect of readahead in the cache manager. At some point the cache manager stops prefetching reads (or rather at some point the prefetched data isn't sufficient to satisfy your reads and so the cache manager has to stall). That may account for the slowdown on larger I/Os that you're seeing.

Have you tried adding FILE_FLAG_SEQUENTIAL_SCAN to your CreateFile flags? That instructs the prefetcher to be even more aggressive.

This may be counter-intuitive, but traditionally the fastest way to read data off the disk is to use asynchronous I/O and FILE_FLAG_NO_BUFFERING. When you do that, the I/O goes directly from the disk driver into your I/O buffers with nothing to get in the way (assuming that the segments of the file are contiguous - if they're not, the filesystem will have to issue several disk reads to satisfy the application read request). Of course it also means that you lose the built-in prefetch logic and have to roll your own. But with FILE_FLAG_NO_BUFFERING you have complete control of your I/O pipeline.

One other thing to remember: When you're doing asynchronous I/O, it's important to ensure that you always have an I/O request oustanding - otherwise you lose potential time between when the last I/O completes and the next I/O is started.

How do I compute the non-client window size in WPF?

6 votes

WPF has the SystemParameters class that exposes a great number of system metrics. On my computer I have noticed that a normal window has a title that is 30 pixels high and a border that is 8 pixels wide. This is on Windows 7 with the Aero theme enabled:

Non-client area - Aero

However, SystemParameters return the following values:

SystemParameters.BorderWidth = 5
SystemParameters.CaptionHeight = 21

Here I have disabled the Aero theme:

Non-client area - classic

Now, SystemParameters return the following values:

SystemParameters.BorderWidth = 1
SystemParameters.CaptionHeight = 18

How do I compute the actual observed values by using SystemParameters?

For a resizable window you need to use a different set of parameters to compute the size:

var titleHeight = SystemParameters.WindowCaptionHeight
  + SystemParameters.ResizeFrameHorizontalBorderHeight;
var verticalBorderWidth = SystemParameters.ResizeFrameVerticalBorderWidth;

These sizes will change when you modify the theme.

What to do when FreeLibrary API call fails?

6 votes

Question

I have a third-party DLL that throws an unhandled exception when attempting to unload it from my native C application. This results in the call to FreeLibrary failing, and the module remaining loaded in my process.

Are there any options to forceably unload the library?

What do you do when the FreeLibrary calls?

Additional Background

When using load-time dynamic linking this is annoying enough, but ultimately the application gets torn down by the OS. The problem comes when using run-time dynamic linking. I load up this DLL, use it, and then in some instances I need to unload it from my process's virtual address space and then continue running. When I call FreeLibrary on the third-party library, it does some cleanup work (i.e. in DllMain when DLL_PROCESS_DETACH is called). While it's doing it's cleanup, it causes an exception to be thrown which it doesn't handle, and bubbles up as an unhandled exception to FreeLibrary. This results in the call failing and the module remaining loaded.

I've put a ticket in with the vendor so hopefully I can get a fix which will allow this specific library to unload successfully. In case I don't however, and for the general case of this issue, I'm curious as to what the options are.

If you are after only unloading dll from the memory you can use

UnmapViewOfFile

providing bases address of your loaded dll as an argument.

Example:

HINSTANCE hInst = LoadLibrary( "path_to_dll" );

if( !FreeLibrary( hInst ) )
{
   fprintf( stderr, "Couldn't unload library. Error Code: %02X\n. Attempting to unmap...", GetLastError() );
   if( !UnmapViewOfFile( hInst ) )
   { 
     fprintf( stderr, "Couldn't unmap the file! Error Code: %02X\n", GetLastError( ) );
   }
}

Or if it's a library that you didn't explicitly load (e.g. a library dependency that was loaded by a library that you loaded) and you don't have the handle, then use GetModuleHandle:

HINSTANCE hInst = GetModuleHandle( "dllname_you_didn't_load" );
if( hInst != NULL )
{
   if( !UnmapViewOfFile( hInst ) )
   { 
     fprintf( stderr, "Couldn't unmap the file! Error Code: %02X\n", GetLastError( ) );
   }
}