Merge commit 'upstream/4.6.1'
diff --git a/src/finite_difference_multi_threaded.c b/src/finite_difference_multi_threaded.c
new file mode 100644 (file)
index 0000000..37f507b
--- /dev/null
@@ -0,0 +1,355 @@
+/* atlc - arbitrary transmission line calculator, for the analysis of
+transmission lines and directional couplers. 
+
+Copyright (C) 2002. Dr. David Kirkby, PhD (G8WRB).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+USA.
+
+Dr. David Kirkby, e-mail drkirkby at gmail.com 
+
+*/
+
+#include "config.h"
+
+#define SLOW
+
+#ifndef ENABLE_POSIX_THREADS 
+
+/* If we are not compiling for multi-threaded use, a stub function
+is defined, but never used. This keeps the linker happy. */
+
+double finite_difference_multi_threaded()
+{
+  return(0.0);
+}
+#endif
+
+
+#ifdef ENABLE_POSIX_THREADS 
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+#include <pthread.h>
+
+#ifdef HAVE_SCHED_H
+#include <sched.h>
+#endif
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+#include "definitions.h"
+#include "exit_codes.h"
+
+extern int coupler;
+extern int width, height;
+extern double **Vij, **Er;
+extern signed char **oddity;
+extern int number_of_workers; 
+
+
+/* The algorithm for this is based on one in the book 'Foundations of Multithreaded,
+Parallel and Distributed Programming' by G. R. Andrews (2000), especially
+chapter 11, Fig 11.6, "Red/Black Gauss-Seidel using shared variables".
+
+Basically, the array is considered a chess board. Since the computation at each
+pixel uses only the values above, below, to the left and to the right of it,
+those four neighbours are all the same colour as each other (and the opposite
+colour to the pixel itself). So all the black squares can be computed in
+parallel, since no black square depends on another black square. Once the black
+squares are done, all the white squares can be computed, again in parallel,
+since each white square depends only on its black neighbours. So the algorithm:
+
+1) Cover the array with a large chess board of black and white squares.
+
+
+B W B W B W B W B W B W B W B W B W B W B W B W B W B W
+W B W B W B W B W B W B W B W B W B W B W B W B W B W B
+B W B W B W B W B W B W B W B W B W B W B W B W B W B W 
+W B W B W B W B W B W B W B W B W B W B W B W B W B W B 
+B W B W B W B W B W B W B W B W B W B W B W B W B W B W 
+W B W B W B W B W B W B W B W B W B W B W B W B W B W B 
+B W B W B W B W B W B W B W B W B W B W B W B W B W B W 
+W B W B W B W B W B W B W B W B W B W B W B W B W B W B 
+B W B W B W B W B W B W B W B W B W B W B W B W B W B W 
+W B W B W B W B W B W B W B W B W B W B W B W B W B W B
+B W B W B W B W B W B W B W B W B W B W B W B W B W B W 
+W B W B W B W B W B W B W B W B W B W B W B W B W B W B
+
+
+2) Split the array into a number of vertical strips, one for each CPU.
+
+B W B W B W B   W B W B W B W   B W B W B W B   W B W B W B W
+W B W B W B W   B W B W B W B   W B W B W B W   B W B W B W B
+B W B W B W B   W B W B W B W   B W B W B W B   W B W B W B W 
+W B W B W B W   B W B W B W B   W B W B W B W   B W B W B W B 
+B W B W B W B   W B W B W B W   B W B W B W B   W B W B W B W 
+W B W B W B W   B W B W B W B   W B W B W B W   B W B W B W B 
+B W B W B W B   W B W B W B W   B W B W B W B   W B W B W B W 
+W B W B W B W   B W B W B W B   W B W B W B W   B W B W B W B 
+B W B W B W B   W B W B W B W   B W B W B W B   W B W B W B W 
+W B W B W B W   B W B W B W B   W B W B W B W   B W B W B W B
+B W B W B W B   W B W B W B W   B W B W B W B   W B W B W B W 
+W B W B W B W   B W B W B W B   W B W B W B W   B W B W B W B
+
+3) Compute the black squares, with all the strips done in parallel. 
+
+B   B   B   B     B   B   B     B   B   B   B     B   B   B  
+  B   B   B     B   B   B   B     B   B   B     B   B   B   B
+B   B   B   B     B   B   B     B   B   B   B     B   B   B   
+  B   B   B     B   B   B   B     B   B   B     B   B   B   B 
+B   B   B   B     B   B   B     B   B   B   B     B   B   B   
+  B   B   B     B   B   B   B     B   B   B     B   B   B   B 
+B   B   B   B     B   B   B     B   B   B   B     B   B   B   
+  B   B   B     B   B   B   B     B   B   B     B   B   B   B 
+B   B   B   B     B   B   B     B   B   B   B     B   B   B   
+  B   B   B     B   B   B   B     B   B   B     B   B   B   B
+B   B   B   B     B   B   B     B   B   B   B     B   B   B   
+  B   B   B     B   B   B   B     B   B   B     B   B   B   B
+
+Thread 0          Thread 1        Thread 2         Thread 3
+
+4) Wait until all the black squares are done.
+
+5) Compute all the white squares in parallel. 
+
+  W   W   W     W   W   W   W     W   W   W     W   W   W   W
+W   W   W   W     W   W   W     W   W   W   W     W   W   W  
+  W   W   W     W   W   W   W     W   W   W     W   W   W   W 
+W   W   W   W     W   W   W     W   W   W   W     W   W   W   
+  W   W   W     W   W   W   W     W   W   W     W   W   W   W 
+W   W   W   W     W   W   W     W   W   W   W     W   W   W   
+  W   W   W     W   W   W   W     W   W   W     W   W   W   W 
+W   W   W   W     W   W   W     W   W   W   W     W   W   W   
+  W   W   W     W   W   W   W     W   W   W     W   W   W   W 
+W   W   W   W     W   W   W     W   W   W   W     W   W   W  
+  W   W   W     W   W   W   W     W   W   W     W   W   W   W 
+W   W   W   W     W   W   W     W   W   W   W     W   W   W  
+Thread 0          Thread 1        Thread 2         Thread 3
+
+
+6) Repeat steps 3) to 5) a large number of times (typically 100). 
+
+7) Check for convergence. 
+
+8) Stop if the solution has converged. 
+*/
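+
+/* In equation form, the update applied at each point (see worker() below) is
+a successive over-relaxation step with relaxation parameter r. Where the
+dielectric is the same all around:
+
+  V[i][j] <- (r/4)*(V[i][j+1] + V[i+1][j] + V[i][j-1] + V[i-1][j]) + (1-r)*V[i][j]
+
+At a dielectric interface, each neighbour's voltage is instead weighted by
+Er[i][j]*Er[n]/(Er[i][j]+Er[n]), where Er[n] is that neighbour's relative
+permittivity, and the weighted average is blended with the old value using
+the same r/(1-r) split. */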
+
+extern double r;
+
+pthread_mutex_t barrier;  /* mutex semaphore for the barrier */
+pthread_cond_t go;        /* condition variable for leaving */
+int numArrived = 0;       /* count of the threads that have arrived */
+
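+/* Select which of the two barrier implementations below is compiled. */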
+#define BARRIER2 
+
+#ifdef BARRIER1
+void Barrier() {
+#ifndef DEBUG
+  pthread_mutex_lock(&barrier);
+#else
+  if( pthread_mutex_lock(&barrier) != 0 )
+    exit_with_msg_and_exit_code("pthread_mutex_lock failed",PTHREAD_MUTEX_LOCK_FAILED);
+#endif
+  
+  numArrived++;
+  if (numArrived == number_of_workers) {
+    numArrived = 0;
+    pthread_cond_broadcast(&go);
+  } else
+    pthread_cond_wait(&go, &barrier);
+#ifndef DEBUG
+  pthread_mutex_unlock(&barrier);
+#else
+  if( pthread_mutex_unlock(&barrier) != 0 )
+    exit_with_msg_and_exit_code("pthread_mutex_unlock failed",PTHREAD_MUTEX_UNLOCK_FAILED);
+#endif
+}
+#endif
+
+#ifdef BARRIER2
+int numArrived2[2] = {0, 0};
+int barrierNdx = 0;
+
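+/* This is a reusable two-counter barrier. Toggling between two counts means
+that a fast thread racing ahead into its next call of Barrier() increments
+the *other* counter, so it cannot corrupt the count that slower threads are
+still waiting on. The while loop around pthread_cond_wait() also guards
+against spurious wakeups, which the simpler BARRIER1 above does not. */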
+void Barrier() {
+  int localNdx;
+#ifndef DEBUG
+  pthread_mutex_lock(&barrier);
+#else
+  if( pthread_mutex_lock(&barrier) != 0 )
+    exit_with_msg_and_exit_code("pthread_mutex_lock failed in finite_difference_multi_threaded.c",PTHREAD_MUTEX_LOCK_FAILED);
+#endif
+  
+  numArrived2[barrierNdx]++;
+  if (numArrived2[barrierNdx] == number_of_workers) {
+    barrierNdx = (barrierNdx + 1)%2;  /* toggle */
+    numArrived2[barrierNdx] = 0; /* reset other count */
+
+#ifndef DEBUG
+    pthread_cond_broadcast(&go);
+#else
+    if( pthread_cond_broadcast(&go) != 0)
+      exit_with_msg_and_exit_code("pthread_cond_broadcast failed in finite_difference_multi_threaded.c",PTHREAD_COND_BROADCAST_FAILED);
+#endif
+  }
+  else
+  {
+    localNdx = barrierNdx; /* wait on "current" numArrived. */
+    while (numArrived2[localNdx] != number_of_workers)
+#ifndef DEBUG
+       pthread_cond_wait(&go, &barrier);
+#else
+       if( pthread_cond_wait(&go, &barrier) != 0)
+         exit_with_msg_and_exit_code("pthread_cond_wait failed finite_difference_multi_threaded.c",PTHREAD_COND_WAIT_FAILED);
+#endif
+  } 
+#ifndef DEBUG
+  pthread_mutex_unlock(&barrier);
+#else
+  if( pthread_mutex_unlock(&barrier) != 0 )
+    exit_with_msg_and_exit_code("pthread_mutex_unlock failed finite_difference_multi_threaded.c",PTHREAD_MUTEX_UNLOCK_FAILED);
+#endif
+}
+#endif
+
+
+
+/* Each worker computes values in one vertical strip of the grid. Each pass of
+   the main loop does two half-sweeps, one per chess-board colour, separated
+   by barriers, so the grid can be updated in place with no need to copy from
+   one grid to another. */
+
+void *worker(void *arg) {
+  int myid = (int)(long) arg;
+  double r_over_4,a,b,c,d,e,f,g,h;
+  double one_minus_r;
+  int i, j, iters, jstart;
+  int firstcol, lastcol;
+  int strip_width=width/number_of_workers;
+
+  firstcol = myid*strip_width+1;
+  lastcol = firstcol + strip_width - 1;
+  r_over_4=r/4.0;
+  one_minus_r=1-r;
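+  /* The last worker also takes any leftover columns when width is not an
+     exact multiple of number_of_workers; columns 0 and width-1 are never
+     updated. */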
+  if(myid == number_of_workers -1) 
+    lastcol=width-2;
+  Barrier();
+  for (iters = 1; iters <= ITERATIONS; iters++) 
+  {
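+    /* First half-sweep: update the squares with i+j even (one colour of
+       the chess board). */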
+    for(i= firstcol ; i <= lastcol; ++i){
+      if(i%2 ==1 )
+       jstart=1;
+      else
+       jstart=2;
+      for(j=jstart ; j < height-1 ; j+=2){
+        if(oddity[i][j] == ORDINARY_INTERIOR_POINT) /* Same dielectric all around */
+          Vij[i][j]=r_over_4*(Vij[i][j+1]+Vij[i+1][j]+Vij[i][j-1]+Vij[i-1][j])+one_minus_r*Vij[i][j];
+        else if(oddity[i][j] > ORDINARY_INTERIOR_POINT) /* only update dielectrics, not conductors */
+        {
+          a=(Er[i][j] * Er[i][j-1] * Vij[i][j-1])/(Er[i][j] + Er[i][j-1]);
+          b=(Er[i][j] * Er[i][j+1] * Vij[i][j+1])/(Er[i][j] + Er[i][j+1]);
+          c=(Er[i][j] * Er[i-1][j] * Vij[i-1][j])/(Er[i][j] + Er[i-1][j]);
+          d=(Er[i][j] * Er[i+1][j] * Vij[i+1][j])/(Er[i][j] + Er[i+1][j]);
+    
+          e=(Er[i][j] * Er[i][j-1])/(Er[i][j]+Er[i][j-1]);
+          f=(Er[i][j] * Er[i][j+1])/(Er[i][j]+Er[i][j+1]);
+          g=(Er[i][j] * Er[i-1][j])/(Er[i][j]+Er[i-1][j]);
+          h=(Er[i][j] * Er[i+1][j])/(Er[i][j]+Er[i+1][j]);
+                        
+          Vij[i][j]=r*(a+b+c+d)/(e+f+g+h) + (1-r)*Vij[i][j];
+        }
+      }
+    }
+    Barrier();
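+    /* Second half-sweep: the squares with i+j odd (the other colour), using
+       the neighbour values computed in the first half-sweep. */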
+    for(i= firstcol ; i <= lastcol; ++i){
+      if(i%2 ==1 )
+       jstart=2;
+      else
+       jstart=1;
+      for(j=jstart ; j < height -1; j+=2){
+        if(oddity[i][j] == ORDINARY_INTERIOR_POINT) /* Same dielectric all around */
+          Vij[i][j]=r_over_4*(Vij[i][j+1]+Vij[i+1][j]+Vij[i][j-1]+Vij[i-1][j])+one_minus_r*Vij[i][j];
+        else if(oddity[i][j] > ORDINARY_INTERIOR_POINT) /* only update dielectrics, not conductors */
+        {
+          a=(Er[i][j] * Er[i][j-1] * Vij[i][j-1])/(Er[i][j] + Er[i][j-1]);
+          b=(Er[i][j] * Er[i][j+1] * Vij[i][j+1])/(Er[i][j] + Er[i][j+1]);
+          c=(Er[i][j] * Er[i-1][j] * Vij[i-1][j])/(Er[i][j] + Er[i-1][j]);
+          d=(Er[i][j] * Er[i+1][j] * Vij[i+1][j])/(Er[i][j] + Er[i+1][j]);
+    
+          e=(Er[i][j] * Er[i][j-1])/(Er[i][j]+Er[i][j-1]);
+          f=(Er[i][j] * Er[i][j+1])/(Er[i][j]+Er[i][j+1]);
+          g=(Er[i][j] * Er[i-1][j])/(Er[i][j]+Er[i-1][j]);
+          h=(Er[i][j] * Er[i+1][j])/(Er[i][j]+Er[i+1][j]);
+                        
+          Vij[i][j]=r*(a+b+c+d)/(e+f+g+h) + (1-r)*Vij[i][j];
+        }
+      }
+    }
+    Barrier();
+  }
+  Barrier();
+  return(0);
+}
+
+double finite_difference_multi_threaded()
+{
+  int i, j, ret, thread_number;
+  pthread_t thread_id[MAXIMUM_PROCESSING_DEVICES];
+
+
+  double capacitance_per_metre, energy_per_metre;
+
+
+  /* initialize mutex and condition variable */
+  pthread_mutex_init(&barrier, NULL);
+  pthread_cond_init(&go, NULL);
+  for(thread_number=0;thread_number<number_of_workers;thread_number++){   
+    ret=pthread_create(&thread_id[thread_number],NULL, worker,(void *)(long)thread_number);
+    if(ret != 0)
+      exit_with_msg_and_exit_code("failed to create thread in finite_difference_multi_threaded.c",THREAD_CREATION_FAILED);
+  } 
+  /* Wait for each thread to join, i.e. until they have all finished. */
+  for(thread_number=0;thread_number<number_of_workers;thread_number++)
+  {
+    ret=pthread_join(thread_id[thread_number], NULL);
+    if(ret != 0)
+      exit_with_msg_and_exit_code("failed to join thread in finite_difference_multi_threaded.c",THREAD_FAILED_TO_JOIN);
+  }
+  energy_per_metre=0.0;
+  /*
+  The following code is wrong, but worked on all systems apart from AIX !!
+
+  for(i=0;i<width;++i)
+    for(j=0;j<height;++j)
+       energy_per_metre+=find_energy_per_metre(i,j);
+       */
+
+  for(i=1;i<width-1;++i)
+    for(j=1;j<height-1;++j)
+       energy_per_metre+=find_energy_per_metre(i,j);
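+  /* The energy stored per metre of line is W = C*V^2/2, so for a single line
+     with the conductor at 1 V the capacitance is C = 2*W; the coupler case
+     is normalised differently and needs no factor of 2. */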
+  if(coupler==FALSE)
+    capacitance_per_metre=2*energy_per_metre;
+  else 
+    capacitance_per_metre=energy_per_metre;
+  return(capacitance_per_metre);
+}
+#endif