Sunday, January 28, 2007

Thread safe TStack (TThreadStack)

I've moved my blog and it's post to my new blog, please go to Thread safe TStack (TThreadStack) on Landman Code Recently i've been doing a lot of multithreading, and luckily Delphi provides with a handy bunch of classes to make the developers life easier. You've got some basic synchronization classes (TMutex, TEvent, TCriticalSection, TMultiReadExclusiveWriteSynchronizer...) and an basic data container (TThreadList). But for one program I needed a thread safe stack. In this post I will describe how I created my own.

There is not thread safe stack in Borland Turbo Delphi 2006, so off course I started with a Google for delphi TThreadStack, which at this time gives zero results. Searching the newsgroups I found an interesting group called comp.programming.threads, searching that group I found one Pascal Lock-Free stack (which basically means not using a critical section, or any other mechanism, to lock the data). But after testing it, FastMM pointed out an memory leak. I provided the author with a test case to cause the memory leak, but after 5 months no reply.

But I'm a hard person to please, and although it is indeed lock free, I was wondering if in my situation this lock-free solution wasn't to complicated and perhaps slower. So I created an simple TThreadStack from looking at the principle of the TThreadList (Very simple!), and compared it with the lock-free solution.

The source demonstrates is the test project.

program Project2;

{$APPTYPE CONSOLE}

uses
  FastMM4,
  unThreadStack,
  FreeStack, Math,
  Windows, SysUtils, Classes;
const
  NumberOfAllocations = 1000;
  PopTimeOut = 1;
  PushTimeOut = 1;
  NumberOfThreads = 8; // must be an multiple of 4
  NumberOfTest = 10;
type
  TTestRec = packed record
    BigField: array[0..254] of Char;
    SmallerField: Extended;
    SmallField: Byte;
  end;
  PTestRec = ^TTestRec;

  TFreeStackThread = class(TThread)
    FPopper: Boolean;
    FDestination: TFreeStack;
    FFinished: THandle;
  protected
    procedure Execute; override;
  public
    constructor Create(ADestination: TFreeStack; APopper: Boolean; AFinished: THandle);
  end;

  TThreadStackThread = class(TThread)
    FPopper: Boolean;
    FDestination: TThreadStack;
    FFinished: THandle;
  protected
    procedure Execute; override;
  public
    constructor Create(ADestination: TThreadStack; APopper: Boolean; AFinished: THandle);
  end;

  { TThreadStackThread }

constructor TThreadStackThread.Create(ADestination: TThreadStack;
  APopper: Boolean; AFinished: THandle);
begin
  FDestination := ADestination;
  FPopper := APopper;
  FFinished := AFinished;
  inherited Create(False);
end;

procedure TThreadStackThread.Execute;
var
  TempStack: TStack;
  p: PTestRec;
  counter: Int64;
  temp: LongWord;
begin
  counter := 0;
  while (not Terminated) and (counter < NumberOfAllocations) do
  begin
    TempStack := FDestination.LockStack;
    try
      if FPopper then
      begin
        if TempStack.Count > 0 then
        begin
          Dispose(PTestRec(TempStack.Pop));
          inc(counter);
        end;
      end
      else
      begin
        New(p);
        TempStack.Push(p);
        inc(counter);
      end;
    finally
      FDestination.UnlockStack;
    end;
    if FPopper then
      Sleep(PopTimeOut)
    else
      Sleep(PushTimeOut);
  end;
  ReleaseSemaphore(FFinished, 1, @temp);
end;

{ TFreeStackThread }

constructor TFreeStackThread.Create(ADestination: TFreeStack; APopper: Boolean; AFinished: THandle);
begin
  FDestination := ADestination;
  FPopper := APopper;
  FFinished := AFinished;
  inherited Create(False);
end;

procedure TFreeStackThread.Execute;
var
  p: PTestRec;
  counter: Int64;
  temp: LongWord;
begin
  counter := 0;
  p := nil;
  while (not Terminated) and (counter < NumberOfAllocations) do
  begin
    if FPopper then
    begin
      if FDestination.Count > 0 then
      begin
        if FDestination.Pop(TObject(p)) then
        begin
          Dispose(p);
          inc(counter);
        end;
      end;
    end
    else
    begin
      if p = nil then
        New(p);
      if FDestination.Push(TObject(p)) then
      begin
        p := nil;
        inc(counter);
      end;
    end;
    if FPopper then
      Sleep(PopTimeOut)
    else
      Sleep(PushTimeOut);
  end;
  ReleaseSemaphore(FFinished, 1, @temp);
end;

var
  AThreadStacksTests: array[0..NumberOfThreads - 1] of TThreadStackThread;
  AFreeStacksTests: array[0..NumberOfThreads - 1] of TFreeStackThread;
  i, j: Integer;
  Start, Stop, Freq: Int64;
  ThreadStacks: array[0..1] of TThreadStack;
  FreeStacks: array[0..1] of TFreeStack;
  ResultThread: array[0..NumberOfTest - 1] of Double;
  ResultFree: array[0..NumberOfTest - 1] of Double;
  Finished: THandle;
  Mean, StdDev: Extended;
begin
  ThreadStacks[0] := TThreadStack.Create;
  ThreadStacks[1] := TThreadStack.Create;
  FreeStacks[0] := TFreeStack.Create;
  FreeStacks[1] := TFreeStack.Create;
  Finished := CreateSemaphore(nil, 0, NumberOfThreads, 'Thread runners');
  Writeln('Starting ThreadStack threads');
  for j := 0 to NumberOfTest - 1 do
  begin
    QueryPerformanceCounter(Start);
    for I := 0 to NumberOfThreads - 1 do
      AThreadStacksTests[i] := TThreadStackThread.Create(ThreadStacks[i mod 2], (i mod 4) >= 2, Finished);
    for I := 0 to NumberOfThreads - 1 do
      WaitForSingleObject(Finished, INFINITE);
    QueryPerformanceCounter(Stop);
    for I := 0 to NumberOfThreads - 1 do
      AThreadStacksTests[i].Free;
    ResultThread[j] := Stop - Start;
  end;
  Writeln('ThreadStack done.');
  Writeln('Starting TFreeStack threads');
  for j := 0 to NumberOfTest - 1 do
  begin
    QueryPerformanceCounter(Start);
    for I := 0 to NumberOfThreads - 1 do
      AFreeStacksTests[i] := TFreeStackThread.Create(FreeStacks[i mod 2], (i mod 4) >= 2, Finished);
    for I := 0 to NumberOfThreads - 1 do
      WaitForSingleObject(Finished, INFINITE);
    QueryPerformanceCounter(Stop);
    for I := 0 to NumberOfThreads - 1 do
      AFreeStacksTests[i].Free;
    ResultFree[j] := Stop - Start;
  end;
  Writeln('TFreeStack done.');
  Writeln(Format('Calculating the mean and the standard deviation out of %d runs.', [NumberOfTest]));
  QueryPerformanceFrequency(Freq);
  MeanAndStdDev(ResultThread, Mean, StdDev);
  Writeln(Format('TThreadStack: %f (%f)', [Mean, StdDev]));
  Writeln(Format('TThreadStack: %fms (%fms)', [Mean / (Freq / 1000), StdDev / (Freq / 1000)]));
  MeanAndStdDev(ResultFree, Mean, StdDev);
  Writeln(Format('TFreeStack:   %f (%f)', [Mean, StdDev]));
  Writeln(Format('TFreeStack:   %fms (%fms)', [Mean / (Freq / 1000), StdDev / (Freq / 1000)]));
  if DebugHook <> 0 then
    Readln;
  { freeing everything}
  ThreadStacks[0].Free;
  ThreadStacks[1].Free;
  FreeStacks[0].Free;
  FreeStacks[1].Free;
  CloseHandle(Finished);
end.


This one also creates the memory leak. Running the code on a P4 2.26 returns this result:

Starting ThreadStack threads
ThreadStack done.
Starting TFreeStack threads
TFreeStack done.
Calculating the mean and the standard deviation out of 10 runs.
TThreadStack: 7152374,20 (26574,65)
TThreadStack: 1998,12ms (7,42ms)
TFreeStack: 7339945,80 (310570,01)
TFreeStack: 2050,52ms (86,76ms)

So the TFreestack is all most the same speed as the simple critical section based TThreadStack, although I know now the test isn't what you'd call regular, because it's constantly trying to push and pop. So perhaps a more normal situation would result differently. But in my program this situation was expected.

I also liked the simplicity of the TThreadStack above the complexity of the FreeStack, were you had to compile a piece TASM (containing the CAS) for it to work. Another show stopper was the memory leak. But actually I just wanted to post my simple TThreadStack so that next time I have to use it, I can just wander to my own blog. So without further ado, I bring you the following source.

unit unThreadStack;

interface
uses
  Windows, Contnrs;
type
  TStack = Contnrs.TStack;
  TThreadStack = class
  private
    FStack: TStack;
    FLock :TRTLCriticalSection;
  public
    constructor Create();
    destructor Destroy;  override;
    function LockStack : TStack;
    procedure UnlockStack;
    function Count: Integer;
    function Push(AItem: Pointer): Pointer;
    function Pop: Pointer;
    function Peek: Pointer;
  end;

implementation

{ TThreadStack }

function TThreadStack.LockStack: TStack;
begin
  EnterCriticalSection(FLock);
  Result := FStack;
end;

function TThreadStack.Count: Integer;
begin
  EnterCriticalSection(FLock);
  try
    Result := FStack.Count;
  finally
    LeaveCriticalSection(FLock);
  end;
end;

constructor TThreadStack.Create();
begin
  inherited Create();
  InitializeCriticalSection(FLock);
  FStack := Contnrs.TStack.Create;
end;

destructor TThreadStack.Destroy;
begin
  DeleteCriticalSection(FLock);
  FStack.Free;
  inherited;
end;

function TThreadStack.Peek: Pointer;
begin
  EnterCriticalSection(FLock);
  try
    Result := FStack.Peek;
  finally
    LeaveCriticalSection(FLock);
  end;
end;

function TThreadStack.Pop: Pointer;
begin
  EnterCriticalSection(FLock);
  try
    Result := FStack.Pop;
  finally
    LeaveCriticalSection(FLock);
  end;
end;

function TThreadStack.Push(AItem: Pointer): Pointer;
begin
  EnterCriticalSection(FLock);
  try
    Result := FStack.Push(AItem);
  finally
    LeaveCriticalSection(FLock);
  end;
end;

procedure TThreadStack.UnlockStack;
begin
  LeaveCriticalSection(FLock);
end;

end.

If you use this source, I would like it if you just left a comment on this blog.

Friday, April 21, 2006

Fast reading of files using Memory Mapping

I've moved my blog and it's post to my new blog, please go to Fast reading of files using Memory Mapping on Landman Code. It has been six months since I last posted something. Lets just say things got a little busy J. And posting source code on Blogspot seemed to be a bitch because blogspot would filter out the enters. I solved that in the previous post by using an <br /> as an enter. But when copying and pasting from the page the newlines were lost (offcourse DelForExp fixes that.. but still it sucked).

Now I have just a little bit of time, and a few articles I wanted to post. So after some testing I found out blogspot fixed the enter removal and now I’ll try to post more frequently.

Now let’s get ontopic, Memory Mapped Files can be very helpful for reading large files. Looking through the internet you can find many advantages and disadvantages. The important thing is, think about what your doing, MMF can be very fast in one application. But slow in an other, it all depends on the situation, there are enough articles about the subject (for instance this one by the Delphi Compiler Team)

I like MMF a lot when using binary files of a certain format. Let’s assume we have the following file format:

TCustomerStruct = packed record
    CustomerID: Longword;
    CustomerName: array[0..254] of Char;
    CustomerBirthDay: TDateTime;
    CustomerRate: Double;
    AccountManagerID: Longword;
  end;

You could read this using BlockRead:

var
  CustomerFile: file of TCustomerStruct;
  Customers: array of TCustomerStruct;
  i : integer;
begin
  AssignFile(CustomerFile,'c:\customers.cus');
  try
    Reset(CustomerFile); // open the file for reading
    SetLength(Customers, FileSize(CustomerFile)); // create the array
    BlockRead(CustomerFile, Customers, Length(Customers));  // Read the hole party in to the array
    for i := 0 to High(Customers) do
    // List all the customers in a memo
      memCustomerList.Lines.Add('Name: '+ Customers[i].CustomerName);
  finally
    CloseFile(CustomerFile);
  end;

And now using MemoryMapping:

type
  TCustomerStructArray = array[0..MaxInt div SizeOf(TCustomerStruct) - 1] of TCustomerStruct;
  PCustomerStructArray = ^TCustomerStructArray;
var
  CustomerFile : TMappedFile;
  Customers: PCustomerStructArray;
  i : integer;
begin
  CustomerFile := TMappedFile.Create;
  try
    CustomerFile.MapFile('c:\customers.cus');
    Customers := PCustomerStructArray(CustomerFile.Content); // not needed, but handy
    for i := 0 to CustomerFile.Size div SizeOf(TCustomerStruct) -1 do
      memCustomerList.Lines.Add('Name: '+ Customers[i].CustomerName);
  finally
    CustomerFile.Free;
  end;

The MaxInt div SizeOf(TCustomerStruct) – 1 is the maximum amount of records (thus memory) loaded at once.

The TMappedFile class is something I created myself so I can be lazy. Off course I will share that piece of code too.

unit unFileMapping;
{
Copyright (c) 2005-2006 by Davy Landman

See the file COPYING.FPC, included in this distribution,
for details about the copyright. Alternately, you may use this source under the provisions of MPL v1.x or later

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
}

interface
uses
  Windows, SysUtils;
type
  TMappedFile = class
  private
    FMapping: THandle;
    FContent: Pointer;
    FSize: Integer;
    procedure MapFile(const AFileName: WideString);
  public
    constructor Create(const AFileName: WideString);
    destructor Destroy; override;
    property Content: Pointer read FContent;
    property Size: Integer read FSize;
  end;

implementation

function FileExistsLongFileNames(const FileName: WideString): Boolean;
begin
  if Length(FileName) < 2 then
  begin
    Result := False;
    Exit;
  end;
  if CompareMem(@FileName[1], @WideString('\\')[1], 2) then
    Result := (GetFileAttributesW(PWideChar(FileName)) and FILE_ATTRIBUTE_DIRECTORY = 0)
  else
    Result := (GetFileAttributesW(PWideChar(WideString('\\?\' + FileName))) and FILE_ATTRIBUTE_DIRECTORY = 0)
end;

{ TMappedFile }



constructor TMappedFile.Create(const AFileName: WideString);
begin
  inherited Create;
  if FileExistsLongFileNames(AFileName) then
    MapFile(AFileName)
  else
    raise Exception.Create('File "' + AFileName + '" does not exists.');
end;

destructor TMappedFile.Destroy;
begin
  if Assigned(FContent) then
  begin
    UnmapViewOfFile(FContent);
    CloseHandle(FMapping);
  end;
  inherited;
end;

procedure TMappedFile.MapFile(const AFileName: WideString);
var
  FileHandle: THandle;
begin
  if CompareMem(@(AFileName[1]), @('\\'[1]), 2) then
    { Allready an UNC path }
    FileHandle := CreateFileW(PWideChar(AFileName), GENERIC_READ, FILE_SHARE_READ or
      FILE_SHARE_WRITE, nil, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0)
  else
    FileHandle := CreateFileW(PWideChar(WideString('\\?\' + AFileName)), GENERIC_READ, FILE_SHARE_READ or
      FILE_SHARE_WRITE, nil, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
  if FileHandle <> 0 then
  try
    FSize := GetFileSize(FileHandle, nil);
    if FSize <> 0 then
    begin
      FMapping := CreateFileMappingW(FileHandle, nil, PAGE_READONLY, 0, 0, nil);
      //Win32Check(FMapping <> 0);
    end;
  finally
    CloseHandle(FileHandle);
  end;
  if FSize = 0 then
    FContent := nil
  else
    FContent := MapViewOfFile(FMapping, FILE_MAP_READ, 0, 0, 0);
  //Win32Check(FContent <> nil);
end;

end.

The big advantage is, that with BlockRead you can either read the whole content of the file in the array, or buffering the file in blocks. With MMF there is no need to worry about it (unless you get very big files), Windows automatically arranges the memory when requested.

Thursday, August 18, 2005

Getting (possibly) 500% speed gain on divisions

I've moved my blog and it's post to my new blog, please go to Getting (possibly) 500% speed gain on divisions on Landman Code.One of my fellow students once made a remark about the old days when a division was much faster if it was written as multiplication.

This means, instead of devising with a 100, you multiply with 0.01 (1/100). Because some programs of mine have a lot of divisions in their core loops, I investigated the difference.

I created the following test program.

program DivVsMult
 
{$APPTYPE CONSOLE} 
 
uses 
SysUtils
Windows
 
var 
Start,Stop, Start2, Stop2, Freq:int64
i : integer
t : real
CpuSpeed : integer
begin 
{ TODO -oUser -cConsole Main : Insert code here } 
{ Cpu Speed fastes cpu = 1 slower => 10 
it's just to determin the number of time to do the loop 
Maxint div CpuSpeed is calculated } 
if ParamCount = 1 then 
CpuSpeed := StrToIntDef(ParamStr(1),1
else 
CpuSpeed := 10
Writeln('Simple Number division:'); 
Writeln('Calculating'); 
QueryPerformanceFrequency(freq); 
QueryPerformanceCounter(Start); 
for i:=0 to MaxInt div CpuSpeed do 
t := i * (1/100); 
QueryPerformanceCounter(Stop); 
Writeln(Format('First Pass Result: %f',[t])); 
{ This is needed because the compiler would optimize, 
and would notice the result of the loop isn't used at all, 
so therefor the result is useless.. so depending on the compiler, it will 
choose what to do with it, this disables that optimization } 
QueryPerformanceCounter(Start2); 
for i:=0 to MaxInt div CpuSpeed do 
t := i * (1/100); 
QueryPerformanceCounter(Stop2); 
Writeln(Format('Second Pass Result: %15.6f',[t])); 
{ This is needed because the compiler would optimize, 
and would notice the result of the loop isn't used at all, 
so therefor the result is useless.. so depending on the compiler, it will 
choose what to do with it, this disables that optimization } 
Writeln('Done, Results:'); 
Writeln(Format('/ 100 Time: %6.4f seconds'+#13#10
'/ 100 Clock: %d ticks'+#13#10
'* 0.01 Time: %6.4f seconds'+#13#10
'* 0.01 Clock: %d ticks',[(Stop-Start) / freq, , (Stop2-Start2)])); 
Writeln
Writeln('Odd Number division:'); 
QueryPerformanceCounter(Start); 
for i:=0 to high(i) div CpuSpeed do 
t := i / 556
QueryPerformanceCounter(Stop); 
Writeln(Format('First Pass Result: %15.6f',[t])); 
{ This is needed because the compiler would optimize, 
and would notice the result of the loop isn't used at all, 
so therefor the result is useless.. so depending on the compiler, it will 
choose what to do with it, this disables that optimization } 
QueryPerformanceCounter(Start2); 
for i:=0 to high(i) div CpuSpeed do 
t := i * (1/556); 
QueryPerformanceCounter(Stop2); 
Writeln(Format('Second Pass Result: %15.6f',[t])); 
Writeln(Format('/ 556 Time: %6.4f seconds'+#13#10
'/ 556 Clock: %d ticks'+#13#10
'* (1/556) Time: %6.4f seconds'+#13#10
'* (1/556) Clock: %d ticks',[(Stop-Start) / freq, (Stop-Start), (Stop2-Start2) / freq, (Stop2-Start2)])); 
Writeln(Format(' (1/556) = %15.14f (approximate)',[1/556])); 
// Readln; 
 
end
 

On an old P3 900Mhz:

Simple Number division:
Calculating
First Pass Result: 2147483,64
Second Pass Result: 2147483,64
Done, Results:
/ 100 Time: 10,3319 seconds
/ 100 Clock: 36983482 ticks
* 0.01 Time: 2,0378 seconds
* 0.01 Clock: 7294251 ticks

Odd Number division:
First Pass Result: 386238,0647482014610000
Second Pass Result: 386238,0647482014610000
Done, Results:
/ 556 Time: 10,0735 seconds
/ 556 Clock: 36058581 ticks
* (1/556) Time: 2,0446 seconds
* (1/556) Clock: 7318775 ticks
(1/556) = 0,0017985611510791 (approximate)

On a new P4 2.3 Ghz:

Simple Number division:
Calculating
First Pass Result: 2147483.64
Second Pass Result: 2147483.64
Done, Results:
/ 100 Time: 4.6227 seconds
/ 100 Clock: 16547055 ticks
* 0.01 Time: 1.0782 seconds
* 0.01 Clock: 3859508 ticks

Odd Number division:
First Pass Result: 386238.064748
Second Pass Result: 386238.064748
Done, Results:
/ 556 Time: 4.5820 seconds
/ 556 Clock: 16401425 ticks
* (1/556) Time: 12.1746 seconds
* (1/556) Clock: 43579366 ticks
(1/556) = 0.00179856115108 (approximate)

The results are variating, on simple numbers like 0.01 the speedup is allways working, but somehow the very complex numbers tend to be slower sometimes.

I use this tip allot when working with percentage.