diff --git a/lbm_cl.py b/lbm_cl.py index 722fe42c947a165882df5537d3a6fd72fd39fb46..e91f872c12f81fb894f9f7d15eb578b83b87267c 100755 --- a/lbm_cl.py +++ b/lbm_cl.py @@ -18,23 +18,21 @@ import time ##################" definition of default values # number of conservative variables -_m = 2 +_m = 9 # number of kinetic variables _n = 4 * _m -_ivplot = 1 +_ivplot = 0 # grid size -_nx = 64 -_ny = 64 - -Lx = 2 * 3.14159265358979323846264338328 -Ly = 2 * 3.14159265358979323846264338328 +_nx = 256 +_ny = 256 Lx = 1 Ly = 1 + _dx = Lx / _nx _dy = Ly / _ny @@ -42,12 +40,12 @@ _dy = Ly / _ny vel = np.array([1., 1.]) # lattice speed -_vmax = 7. +_vmax = 20. # time stepping _Tmax = 10. / _vmax -#_Tmax = 0. +_Tmax = 2. cfl = 1 _dt = cfl * _dx / _vmax @@ -61,20 +59,24 @@ def solve_ocl(m = _m, n = _n, nx = _nx, ny = _ny, dt = _dt, animate = False): + + ff = "_F" + # load and adjust C program source = open("lbm_kernels.cl", "r").read() source = source.replace("_nx_", "("+str(nx)+")") source = source.replace("_ny_", "("+str(ny)+")") - source = source.replace("_dx_", "("+str(dx)+"f)") - source = source.replace("_dy_", "("+str(dy)+"f)") - source = source.replace("_dt_", "("+str(dt)+"f)") + source = source.replace("_dx_", "("+str(dx)+ ff + ")") + source = source.replace("_dy_", "("+str(dy)+ ff + ")") + source = source.replace("_dt_", "("+str(dt)+ ff + ")") source = source.replace("_m_", "("+str(m)+")") source = source.replace("_n_", "("+str(n)+")") - source = source.replace("_vx_", "("+str(vel[0])+"f)") - source = source.replace("_vy_", "("+str(vel[1])+"f)") - source = source.replace("_lambda_", "("+str(vmax)+"f)") + source = source.replace("_vx_", "("+str(vel[0])+ ff + ")") + source = source.replace("_vy_", "("+str(vel[1])+ ff + ")") + source = source.replace("_lambda_", "("+str(vmax)+ ff + ")") - #print(source) + source = source.replace("_F", "") + print(source) #exit(0) @@ -88,9 +90,9 @@ def solve_ocl(m = _m, n = _n, nx = _nx, ny = _ny, # create OpenCL buffers fn_gpu = cl.Buffer(ctx, mf.READ_WRITE, - size=(4 * m * nx * ny * np.dtype('float32').itemsize)) + size=(4 * m * nx * ny * np.dtype('float64').itemsize)) fnp1_gpu = cl.Buffer(ctx, mf.READ_WRITE, - size=(4 * m * nx * ny * np.dtype('float32').itemsize)) + size=(4 * m * nx * ny * np.dtype('float64').itemsize)) # create a queue (for submitting opencl operations) queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) @@ -109,7 +111,7 @@ def solve_ocl(m = _m, n = _n, nx = _nx, ny = _ny, t = 0 iter = 0 elapsed = 0.; - fn_cpu = np.empty((4 * m * nx * ny, ), dtype = np.float32) + fn_cpu = np.empty((4 * m * nx * ny, ), dtype = np.float64) print("start OpenCL computations...") while t < Tmax: diff --git a/lbm_kernels.cl b/lbm_kernels.cl index 76b517b5d67fc308a1faa84f24e403473a50d589..386d4946b3ee5da32dc1ca43da48698821ea34d6 100644 --- a/lbm_kernels.cl +++ b/lbm_kernels.cl @@ -1,3 +1,8 @@ + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define real double + #define _NX _nx_ #define _NY _ny_ #define _DX _dx_ @@ -10,65 +15,66 @@ #define _LAMBDA _lambda_ +#define M_PI (3.14159265358979323846264338328_F) + #define _VOL (_DX * _DY) __constant int dir[4][2] = { {-1, 0}, {1, 0}, {0, -1}, {0, 1}}; -__constant float ds[4] = { _DY, _DY, _DX, _DX }; - +__constant real ds[4] = { _DY, _DY, _DX, _DX }; // physical flux of the hyperbolic system -void flux_phy(const float *w, const float* vnorm, float* flux){ - float vdotn = _VX * vnorm[0] + _VY * vnorm[1]; - int signe = 1; - for(int iv = 0; iv < _M; iv++){ - flux[iv] = vdotn * w[iv] * signe; - signe = -signe; - } -} +/* void flux_phy(const real *w, const real* vnorm, real* flux){ */ +/* real vdotn = _VX * vnorm[0] + _VY * vnorm[1]; */ +/* int signe = 1; */ +/* for(int iv = 0; iv < _M; iv++){ */ +/* flux[iv] = vdotn * w[iv] * signe; */ +/* signe = -signe; */ +/* } */ +/* } */ -/* void flux_phy(const float *W, const float *vn, float *flux) { */ +void flux_phy(const real *W, const real *vn, real *flux) { -/* float gam = 1.6666666666f; */ + real gam = 1.6666666666_F; -/* float un = W[1]/W[0]*vn[0]+W[3]/W[0]*vn[1]+W[4]/W[0]*vn[2]; */ -/* float bn = W[7]*vn[0]+W[5]*vn[1]+W[6]*vn[2]; */ + real un = (W[1] * vn[0] + W[3] * vn[1] + W[4] * vn[2]) / W[0]; + real bn = W[7] * vn[0] + W[5] * vn[1] + W[6] * vn[2]; -/* float p = (gam-1)*(W[2] - W[0]*(W[1]/W[0]*W[1]/W[0] */ -/* + W[3]/W[0]*W[3]/W[0] */ -/* + W[4]/W[0]*W[4]/W[0])/2 */ -/* - (W[7]*W[7]+W[5]*W[5]+W[6]*W[6])/2); */ + real p = (gam - 1) * (W[2] + - (W[1] * W[1] + W[3] * W[3] + + W[4] * W[4]) / 2 / W[0] + - (W[7] * W[7] + W[5] * W[5] + W[6] * W[6]) /2); -/* flux[0] = W[0]*un; */ -/* flux[1] = W[0]*un*W[1]/W[0] + */ -/* (p + (W[7]*W[7] + W[5]*W[5] + W[6]*W[6])/2) */ -/* *vn[0] - bn*W[7]; */ -/* flux[2] = (W[2] + p + (W[7]*W[7] + W[5]*W[5] + W[6]*W[6])/2)*un */ -/* - (W[7]*W[1]/W[0] + W[5]*W[3]/W[0] + W[6]*W[4]/W[0])*bn; */ -/* flux[3] = W[0]*un*W[3]/W[0] + (p + (W[7]*W[7] + W[5]*W[5] */ -/* + W[6]*W[6])/2)*vn[1] - bn*W[5]; */ -/* flux[4] = W[0]*un*W[4]/W[0] + (p + (W[7]*W[7] + W[5]*W[5] */ -/* + W[6]*W[6])/2)*vn[2] - bn*W[6]; */ + flux[0] = W[0] * un; + flux[1] = un*W[1] + + (p + (W[7] * W[7] + W[5] * W[5] + W[6] * W[6]) / 2) + * vn[0] - bn * W[7]; + flux[2] = (W[2] + p + (W[7]*W[7] + W[5]*W[5] + W[6]*W[6])/2)*un + - (W[7]*W[1] + W[5]*W[3] + W[6]*W[4])*bn/W[0]; + flux[3] = un*W[3] + (p + (W[7]*W[7] + W[5]*W[5] + + W[6]*W[6])/2)*vn[1] - bn*W[5]; + flux[4] = un*W[4] + (p + (W[7]*W[7] + W[5]*W[5] + + W[6]*W[6])/2)*vn[2] - bn*W[6]; -/* flux[5] = -bn*W[3]/W[0] + un*W[5] + W[8]*vn[1]; */ -/* flux[6] = -bn*W[4]/W[0] + un*W[6] + W[8]*vn[2]; */ -/* flux[7] = -bn*W[1]/W[0] + un*W[7] + W[8]*vn[0]; */ + flux[5] = -bn*W[3]/W[0] + un*W[5] + W[8]*vn[1]; + flux[6] = -bn*W[4]/W[0] + un*W[6] + W[8]*vn[2]; + flux[7] = -bn*W[1]/W[0] + un*W[7] + W[8]*vn[0]; -/* flux[8] = 6*6*bn; */ -/* } */ + flux[8] = 6*6*bn; +} // equilibrium "maxwellian" from macro data w -void w2f(const float *w, float *f){ +void w2f(const real *w, real *f){ for(int d = 0; d < 4; d++){ - float flux[_M]; - float vnorm[2] = {dir[d][0], dir[d][1]}; + real flux[_M]; + real vnorm[2] = {dir[d][0], dir[d][1]}; flux_phy(w, vnorm, flux); for(int iv = 0; iv < _M; iv++){ f[d * _M + iv] = w[iv] / 4 + flux[iv] / 2 / _LAMBDA; @@ -78,7 +84,7 @@ void w2f(const float *w, float *f){ // macro data w from micro data f -void f2w(const float *f, float *w){ +void f2w(const real *f, real *w){ for(int iv = 0; iv < _M; iv++) w[iv] = 0; for(int d = 0; d < 4; d++){ for(int iv = 0; iv < _M; iv++){ @@ -90,56 +96,56 @@ void f2w(const float *f, float *w){ // exact macro data w // transport -void exact_sol(float* xy, float t, float* w){ - int signe = 1; - for(int iv = 0; iv < _M; iv++){ - float x = xy[0] - t * _VX * signe - 0.5f; - float y = xy[1] - t * _VY * signe - 0.5f; - float d2 = x * x + y *y; - w[iv] = exp(-30*d2); - signe = -signe; - } -} +/* void exact_sol(real* xy, real t, real* w){ */ +/* int signe = 1; */ +/* for(int iv = 0; iv < _M; iv++){ */ +/* real x = xy[0] - t * _VX * signe - 0.5_F; */ +/* real y = xy[1] - t * _VY * signe - 0.5_F; */ +/* real d2 = x * x + y *y; */ +/* w[iv] = exp(-30*d2); */ +/* signe = -signe; */ +/* } */ +/* } */ // mhd -/* void conservatives(float *y, float *w) { */ -/* float gam = 1.6666666666f; */ - -/* w[0] = y[0]; */ -/* w[1] = y[0]*y[1]; */ -/* w[2] = y[2]/(gam-1) + y[0]*(y[1]*y[1]+y[3]*y[3]+y[4]*y[4])/2 */ -/* + (y[7]*y[7]+y[5]*y[5]+y[6]*y[6])/2; */ -/* w[3] = y[0]*y[3]; */ -/* w[4] = y[0]*y[4]; */ -/* w[5] = y[5]; */ -/* w[6] = y[6]; */ -/* w[7] = y[7]; // Bx */ -/* w[8] = y[8]; // psi */ -/* } */ +void conservatives(real *y, real *w) { + real gam = 1.6666666666_F; + + w[0] = y[0]; // rho + w[1] = y[0]*y[1]; // rho u + w[2] = y[2]/(gam-1) + y[0]*(y[1]*y[1]+y[3]*y[3]+y[4]*y[4])/2 + + (y[7]*y[7]+y[5]*y[5]+y[6]*y[6])/2; // rho E + w[3] = y[0]*y[3]; // rho v + w[4] = y[0]*y[4]; // rho w + w[5] = y[5]; // By + w[6] = y[6]; // Bz + w[7] = y[7]; // Bx + w[8] = y[8]; // psi +} -/* void exact_sol(float* x, float t, float* w){ */ -/* float gam = 1.6666666666f; */ -/* float yL[9]; */ +void exact_sol(real* x, real t, real* w){ + real gam = 1.6666666666_F; + real yL[9]; -/* yL[0] = gam*gam; */ -/* yL[1] = -sin(x[1]); */ -/* yL[2] = gam; */ -/* yL[3] = sin(x[0]); */ -/* yL[4] = 0; */ -/* yL[5] = sin(2*(x[0])); */ -/* yL[6] = 0; */ -/* yL[7] = -sin(x[1]); */ -/* yL[8] = 0; */ + yL[0] = gam*gam; //rho + yL[1] = -sin(2 * M_PI * x[1]); // u + yL[2] = gam; // p + yL[3] = sin(2 * M_PI * x[0]); // v + yL[4] = 0; // w + yL[5] = sin(4 * M_PI * x[0]); // By + yL[6] = 0; // Bz + yL[7] = -sin(2 * M_PI * x[1]); // Bx + yL[8] = 0; // psi -/* conservatives(yL, w); */ + conservatives(yL, w); -/* } */ +} // initial condition on the macro data -__kernel void init_sol(__global float *fn){ +__kernel void init_sol(__global real *fn){ int id = get_global_id(0); @@ -148,14 +154,14 @@ __kernel void init_sol(__global float *fn){ int ngrid = _NX * _NY; - float wnow[_M]; + real wnow[_M]; - float t = 0; - float xy[2] = {i * _DX + _DX / 2, j * _DY + _DY / 2}; + real t = 0; + real xy[2] = {i * _DX + _DX / 2, j * _DY + _DY / 2}; exact_sol(xy, t, wnow); - float fnow[_N]; + real fnow[_N]; w2f(wnow, fnow); //printf("x=%f, y=%f \n",xy[0],xy[1]); @@ -171,7 +177,7 @@ __kernel void init_sol(__global float *fn){ // one time step of the LBM scheme -__kernel void time_step(__global float *fn, __global float *fnp1){ +__kernel void time_step(__global real *fn, __global real *fnp1){ int id = get_global_id(0); @@ -181,7 +187,7 @@ __kernel void time_step(__global float *fn, __global float *fnp1){ int ngrid = _NX * _NY; - float fnow[_N]; + real fnow[_N]; // periodic shift in each direction for(int d = 0; d < 4; d++){ @@ -195,16 +201,16 @@ __kernel void time_step(__global float *fn, __global float *fnp1){ } } - float wnow[_M]; + real wnow[_M]; f2w(fnow, wnow); - float fnext[_N]; + real fnext[_N]; // first order relaxation w2f(wnow, fnext); // second order relaxation for(int ik = 0; ik < _N; ik++) - fnext[ik] = 2 * fnext[ik] - fnow[ik]; + fnext[ik] = 1.9_F * fnext[ik] - 0.9_F * fnow[ik]; // save for(int ik = 0; ik < _N; ik++){