Commit b465350d authored by Philippe Helluy's avatar Philippe Helluy

bug in 1d transport

parent f240b67c
#define _DEG 2
#define _RAF 3
#define _M 2
#define _DEG THE_DEG
#define _RAF THE_RAF
#define _M THE_M
#define _NP (_DEG + 1)
......@@ -103,7 +103,7 @@ __constant float gauss_lob_dpsi[] = {
-5.33333333333333333333333333336,
3.49148624377587810025755976667,
0,
-3.49148624377587810025755976662,
-3.49148624377587810025755976662,
5.33333333333333333333333333336,
2.82032835588485332564727827399,
-1.52752523165194666886268239791,
......@@ -143,9 +143,12 @@ int Varindex(int ipg, int iv) {
return ipg + npg * iv;
}
__constant float vit[3]={1, 0, 0};
void NumFlux(float *wL, float *wR, float *vnorm, float *flux)
{
float vn = sqrt(0.5) * vnorm[0] + sqrt(0.5) * vnorm[1];
float vn = vit[0] * vnorm[0] + vit[1] * vnorm[1] + vit[2] * vnorm[2];
float vnp = vn > 0.0 ? vn : 0.0;
float vnm = vn - vnp;
flux[0] = vnp * wL[0] + vnm * wR[0];
......@@ -154,15 +157,19 @@ void NumFlux(float *wL, float *wR, float *vnorm, float *flux)
vnm = vn - vnp;
flux[1] = vnp * wL[1] + vnm * wR[1];
}
void ImposedData(const float *x, const float t, float* w)
{
float vx = sqrt(0.5) * x[0] + sqrt(0.5) * x[1];
float vx = vit[0] * x[0] + vit[1] * x[1] + vit[2] * x[2];
float xx = vx - t;
w[0] = xx * xx;
w[0] = sin(xx);
float pi = 3.1415926535897932384626433832795028841971694;
//w[0] = xx * xx;
w[0] = sin(2*pi*xx);
w[0]=xx;
xx = vx + t;
w[1] = xx * xx;
w[1] = sin(xx);
//w[1] = xx * xx;
w[1] = sin(2*pi*xx);
w[1]=xx;
}
......@@ -222,19 +229,160 @@ float t30 = t2 * t16;
float t32 = x * t2;
float t34 = x * y;
float t36 = y * t16;
dtau[0][0] = -t3 * p[0] + t3 * p[3] - t6 * p[6] + t6 * p[9] + t9 * p[12] - t9 * p[15] + t12 * p[18] - t12 * p[21];
dtau[0][1] = -t17 * p[0] + t19 * p[3] - t19 * p[6] + t17 * p[9] + t23 * p[12] - t25 * p[15] + t25 * p[18] - t23 * p[21];
dtau[0][2] = -t30 * p[0] + t32 * p[3] - t34 * p[6] + t36 * p[9] + t30 * p[12] - t32 * p[15] + t34 * p[18] - t36 * p[21];
dtau[1][0] = -t3 * p[1] + t3 * p[4] - t6 * p[7] + t6 * p[10] + t9 * p[13] - t9 * p[16] + t12 * p[19] - t12 * p[22];
dtau[1][1] = -t17 * p[1] + t19 * p[4] - t19 * p[7] + t17 * p[10] + t23 * p[13] - t25 * p[16] + t25 * p[19] - t23 * p[22];
dtau[1][2] = -t30 * p[1] + t32 * p[4] - t34 * p[7] + t36 * p[10] + t30 * p[13] - t32 * p[16] + t34 * p[19] - t36 * p[22];
dtau[2][0] = -t3 * p[2] + t3 * p[5] - t6 * p[8] + t6 * p[11] + t9 * p[14] - t9 * p[17] + t12 * p[20] - t12 * p[23];
dtau[2][1] = -t17 * p[2] + t19 * p[5] - t19 * p[8] + t17 * p[11] + t23 * p[14] - t25 * p[17] + t25 * p[20] - t23 * p[23];
dtau[2][2] = -t30 * p[2] + t32 * p[5] - t34 * p[8] + t36 * p[11] + t30 * p[14] - t32 * p[17] + t34 * p[20] - t36 * p[23];
dtau[0][0] = -t3 * p[0] + t3 * p[3] - t6 * p[6] + t6 * p[9] +
t9 * p[12] - t9 * p[15] + t12 * p[18] - t12 * p[21];
dtau[0][1] = -t17 * p[0] + t19 * p[3] - t19 * p[6] + t17 * p[9]
+ t23 * p[12] - t25 * p[15] + t25 * p[18] - t23 * p[21];
dtau[0][2] = -t30 * p[0] + t32 * p[3] - t34 * p[6] + t36 * p[9]
+ t30 * p[12] - t32 * p[15] + t34 * p[18] - t36 * p[21];
dtau[1][0] = -t3 * p[1] + t3 * p[4] - t6 * p[7] + t6 * p[10]
+ t9 * p[13] - t9 * p[16] + t12 * p[19] - t12 * p[22];
dtau[1][1] = -t17 * p[1] + t19 * p[4] - t19 * p[7] + t17 * p[10]
+ t23 * p[13] - t25 * p[16] + t25 * p[19] - t23 * p[22];
dtau[1][2] = -t30 * p[1] + t32 * p[4] - t34 * p[7] + t36 * p[10]
+ t30 * p[13] - t32 * p[16] + t34 * p[19] - t36 * p[22];
dtau[2][0] = -t3 * p[2] + t3 * p[5] - t6 * p[8] + t6 * p[11]
+ t9 * p[14] - t9 * p[17] + t12 * p[20] - t12 * p[23];
dtau[2][1] = -t17 * p[2] + t19 * p[5] - t19 * p[8] + t17 * p[11]
+ t23 * p[14] - t25 * p[17] + t25 * p[20] - t23 * p[23];
dtau[2][2] = -t30 * p[2] + t32 * p[5] - t34 * p[8] + t36 * p[11]
+ t30 * p[14] - t32 * p[17] + t34 * p[20] - t36 * p[23];
/* dtau[0][0] = -(-1 + z) * (-1 + y) * p[0] + (-1 + z) * (-1 + y) * p[3] - y * (-1 + z) * p[6] + y * (-1 + z) * p[9] + z * (-1 + y) * p[12] - z * (-1 + y) * p[15] + y * z * p[18] - y * z * p[21]; */
/* dtau[0][1] = -(-1 + z) * (-1 + x) * p[0] + x * (-1 + z) * p[3] - x * (-1 + z) * p[6] + (-1 + z) * (-1 + x) * p[9] + z * (-1 + x) * p[12] - x * z * p[15] + x * z * p[18] - z * (-1 + x) * p[21]; */
/* dtau[0][2] = -(-1 + y) * (-1 + x) * p[0] + x * (-1 + y) * p[3] - x * y * p[6] + y * (-1 + x) * p[9] + (-1 + y) * (-1 + x) * p[12] - x * (-1 + y) * p[15] + x * y * p[18] - y * (-1 + x) * p[21]; */
/* dtau[1][0] = -(-1 + z) * (-1 + y) * p[1] + (-1 + z) * (-1 + y) * p[4] - y * (-1 + z) * p[7] + y * (-1 + z) * p[10] + z * (-1 + y) * p[13] - z * (-1 + y) * p[16] + y * z * p[19] - y * z * p[22]; */
/* dtau[1][1] = -(-1 + z) * (-1 + x) * p[1] + x * (-1 + z) * p[4] - x * (-1 + z) * p[7] + (-1 + z) * (-1 + x) * p[10] + z * (-1 + x) * p[13] - x * z * p[16] + x * z * p[19] - z * (-1 + x) * p[22]; */
/* dtau[1][2] = -(-1 + y) * (-1 + x) * p[1] + x * (-1 + y) * p[4] - x * y * p[7] + y * (-1 + x) * p[10] + (-1 + y) * (-1 + x) * p[13] - x * (-1 + y) * p[16] + x * y * p[19] - y * (-1 + x) * p[22]; */
/* dtau[2][0] = -(-1 + z) * (-1 + y) * p[2] + (-1 + z) * (-1 + y) * p[5] - y * (-1 + z) * p[8] + y * (-1 + z) * p[11] + z * (-1 + y) * p[14] - z * (-1 + y) * p[17] + y * z * p[20] - y * z * p[23]; */
/* dtau[2][1] = -(-1 + z) * (-1 + x) * p[2] + x * (-1 + z) * p[5] - x * (-1 + z) * p[8] + (-1 + z) * (-1 + x) * p[11] + z * (-1 + x) * p[14] - x * z * p[17] + x * z * p[20] - z * (-1 + x) * p[23]; */
/* dtau[2][2] = -(-1 + y) * (-1 + x) * p[2] + x * (-1 + y) * p[5] - x * y * p[8] + y * (-1 + x) * p[11] + (-1 + y) * (-1 + x) * p[14] - x * (-1 + y) * p[17] + x * y * p[20] - y * (-1 + x) * p[23]; */
/* dtau[0][0] = 1; */
/* dtau[0][1] = 0; */
/* dtau[0][2] = 0; */
/* dtau[1][0] = 0; */
/* dtau[1][1] = 1; */
/* dtau[1][2] = 0; */
/* dtau[2][0] = 0; */
/* dtau[2][1] = 0; */
/* dtau[2][2] = 1; */
}
__kernel
void get_nodes(int ie, // 1: macrocel index
__constant float *physnodes, // 2: macrocell nodes
__global float *node // 3: node coordinates
)
{
__constant float *physnode = physnodes + ie * 3 * _NHEXA;
int ix = get_global_id(0);
int iy = get_global_id(1);
int iz = get_global_id(2);
//int woffset = ie * _M * _NPG_MACROCELL;
// subcell id
int icL[3];
icL[0] = ix / _NP;
icL[1] = iy / _NP;
icL[2] = iz / _NP;
int pL[3];
pL[0] = ix % _NP;
pL[1] = iy % _NP;
pL[2] = iz % _NP;
// ref coordinates
float hx = 1. / (float) _RAF;
float hy = 1. / (float) _RAF;
float hz = 1. / (float) _RAF;
int offset[3] = {gauss_lob_offset[_DEG] + pL[0],
gauss_lob_offset[_DEG] + pL[1],
gauss_lob_offset[_DEG] + pL[2]};
float x = hx * (icL[0] + gauss_lob_point[offset[0]]);
float y = hy * (icL[1] + gauss_lob_point[offset[1]]);
float z = hz * (icL[2] + gauss_lob_point[offset[2]]);
int ipgL = pL[0] + _NP * icL[0] + _NPG_ROW * (iy + _NPG_ROW * iz);
float xphy[3];
get_tau(x, y, z, physnode, xphy);
node[ipgL + 0 * _NPG_MACROCELL] = xphy[0];
node[ipgL + 1 * _NPG_MACROCELL] = xphy[1];
node[ipgL + 2 * _NPG_MACROCELL] = xphy[2];
/* printf("ipgL=%d ijk=%d %d %d xyz=%f %f %f xphy=%f %f %f\n", */
/* ipgL,ix,iy,iz, */
/* x,y,z,xphy[0],xphy[1],xphy[2]); */
}
__kernel
void Init(int ie, // 1: macrocel index
__constant float *physnodes, // 2: macrocell nodes
__global float *wn // 3: field values
)
{
__constant float *physnode = physnodes + ie * 3 * _NHEXA;
int ix = get_global_id(0);
int iy = get_global_id(1);
int iz = get_global_id(2);
int woffset = ie * _M * _NPG_MACROCELL;
// subcell id
int icL[3];
icL[0] = ix / _NP;
icL[1] = iy / _NP;
icL[2] = iz / _NP;
int pL[3];
pL[0] = ix % _NP;
pL[1] = iy % _NP;
pL[2] = iz % _NP;
// ref coordinates
float hx = 1. / _RAF;
float hy = 1. / _RAF;
float hz = 1. / _RAF;
int offset[3] = {gauss_lob_offset[_DEG] + pL[0],
gauss_lob_offset[_DEG] + pL[1],
gauss_lob_offset[_DEG] + pL[2]};
float x = hx * (icL[0] + gauss_lob_point[offset[0]]);
float y = hy * (icL[1] + gauss_lob_point[offset[1]]);
float z = hz * (icL[2] + gauss_lob_point[offset[2]]);
int ipgL = pL[0] + _NP * icL[0] + _NPG_ROW * (iy + _NPG_ROW * iz);
float xphy[3];
get_tau(x, y, z, physnode, xphy);
float wL[_M];
float tim = 0;
ImposedData(xphy, tim, wL);
for(int iv = 0; iv < _M; iv++) {
int imemL = Varindex(ipgL, iv) + woffset;
wn[imemL] = wL[iv];
//wn[imemL] = ix;
}
}
__kernel
void DGVolume(int ie, // 1: macrocel index
......@@ -254,14 +402,16 @@ void DGVolume(int ie, // 1: macrocel index
// subcell id
int icL[3];
icL[0] = ix / _RAF;
icL[1] = iy / _RAF;
icL[2] = iz / _RAF;
icL[0] = ix / _NP;
icL[1] = iy / _NP;
icL[2] = iz / _NP;
int p[3];
p[0] = ix % _RAF;
p[1] = iy % _RAF;
p[2] = iz % _RAF;
p[0] = ix % _NP;
p[1] = iy % _NP;
p[2] = iz % _NP;
// ref coordinates
......@@ -285,8 +435,17 @@ void DGVolume(int ie, // 1: macrocel index
float codtau[3][3];
{
float dtau[3][3];
//printf("xyz=%f %f %f\n",x,y,z);
get_dtau(x, y, z, physnode, dtau); // 1296 mults
/* printf("dtau=\n%f %f %f \n%f %f %f \n%f %f %f\n", */
/* dtau[0][0],dtau[0][1],dtau[0][2], */
/* dtau[1][0],dtau[1][1],dtau[1][2], */
/* dtau[2][0],dtau[2][1],dtau[2][2]); */
/* for(int ii=0;ii < 8;ii++){ */
/* printf("physnode=%f %f %f\n", */
/* physnode[3*ii+0],physnode[3*ii+1],physnode[3*ii+2]); */
/* } */
codtau[0][0] = dtau[1][1] * dtau[2][2] - dtau[1][2] * dtau[2][1];
codtau[0][1] = -dtau[1][0] * dtau[2][2] + dtau[1][2] * dtau[2][0];
codtau[0][2] = dtau[1][0] * dtau[2][1] - dtau[1][1] * dtau[2][0];
......@@ -324,21 +483,32 @@ void DGVolume(int ie, // 1: macrocel index
}
NumFlux(wL, wL, dphi, flux); // 3m mults when using NumFlux
int ipgR = (ix + iq) % _NP + _NPG_ROW * (iy + _NPG_ROW * iz);
for(int iv = 0; iv < _M; iv++) {
int imemR = Varindex(ipgR, iv) + woffset;
printf("imemR=%d wpg=%f\n",imemR,wpg);
dtwn[imemR] += flux[iv] * wpg;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int iv = 0; iv < _M; iv++){
int imemL = Varindex(ipgL, iv) + woffset;
dtwn[imemL] /= wpg;
}
}
// Compute the surface terms inside one macrocell
__kernel
void DGFlux(int ie, // 1: macrocell index
__constant float *physnodes, // 3: macrocell nodes
__constant float *tnow,
float tnow,
__global float *wn, // 4: field values
__global float *dtwn // 5: time derivative
)
......@@ -355,13 +525,13 @@ void DGFlux(int ie, // 1: macrocell index
// subcell id
int icL[3];
icL[0] = ifx - 1;
icL[1] = iy / _RAF;
icL[2] = iz / _RAF;
icL[1] = iy / _NP;
icL[2] = iz / _NP;
int pL[3];
pL[0] = _RAF - 1;
pL[1] = iy % _RAF;
pL[2] = iz % _RAF;
pL[0] = _NP - 1;
pL[1] = iy % _NP;
pL[2] = iz % _NP;
int icR[3];
......@@ -371,20 +541,32 @@ void DGFlux(int ie, // 1: macrocell index
int pR[3];
pR[0] = 0;
pR[1] = iy % _RAF;
pR[2] = iz % _RAF;
pR[1] = iy % _NP;
pR[2] = iz % _NP;
float hx = 1.0 / _RAF;
float hy = 1.0 / _RAF;
float hz = 1.0 / _RAF;
int offset[3] = {gauss_lob_offset[_DEG] + pL[0],
gauss_lob_offset[_DEG] + pL[1],
gauss_lob_offset[_DEG] + pL[2]};
float x = hx * (icL[0] + gauss_lob_point[offset[0]]);
float y = hy * (icL[1] + gauss_lob_point[offset[1]]);
float z = hz * (icL[2] + gauss_lob_point[offset[2]]);
int offset[3];
float x,y,z;
//if (ifx > 0){
offset[0]=gauss_lob_offset[_DEG] + pL[0];
offset[1]=gauss_lob_offset[_DEG] + pL[1];
offset[2]=gauss_lob_offset[_DEG] + pL[2];
x = hx * (icL[0] + gauss_lob_point[offset[0]]);
y = hy * (icL[1] + gauss_lob_point[offset[1]]);
z = hz * (icL[2] + gauss_lob_point[offset[2]]);
/* } else { */
/* //printf("ifx=%d pR=%d %d %d\n",ifx,pR[0],pR[1],pR[2]); */
/* offset[0]=gauss_lob_offset[_DEG] + pR[0]; */
/* offset[1]=gauss_lob_offset[_DEG] + pR[1]; */
/* offset[2]=gauss_lob_offset[_DEG] + pR[2]; */
/* x = hx * (icR[0] + gauss_lob_point[offset[0]]); */
/* y = hy * (icR[1] + gauss_lob_point[offset[1]]); */
/* z = hz * (icR[2] + gauss_lob_point[offset[2]]); */
/* } */
float codtau[3][3];
{
......@@ -410,19 +592,17 @@ void DGFlux(int ie, // 1: macrocell index
int ipgL = -1;
if (ifx > 0)
ipgL = pL[0] + _RAF * icL[0] + _NPG_ROW * (iy + _NPG_ROW * iz);
ipgL = pL[0] + _NP * icL[0] + _NPG_ROW * (iy + _NPG_ROW * iz);
int ipgR = -1;
if (ifx < _RAF)
ipgR = pR[0] + _RAF * icR[0] + _NPG_ROW * (iy + _NPG_ROW * iz);
ipgR = pR[0] + _NP * icR[0] + _NPG_ROW * (iy + _NPG_ROW * iz);
float xphy[3];
if (ipgL < 0 || ipgR < 0){
get_tau(x, y, z, physnode, xphy);
}
get_tau(x, y, z, physnode, xphy);
float wL[_M], wR[_M];
float tim = *tnow;
float tim = tnow;
for(int iv = 0; iv < _M; iv++) {
if (ipgL >= 0){
int imemL = Varindex(ipgL, iv) + woffset;
......@@ -438,23 +618,22 @@ void DGFlux(int ie, // 1: macrocell index
}
}
// TODO: test ipgL and ipgR for Boundary terms !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
float flux[_M];
NumFlux(wL, wR, vnds, flux); //
//printf("wL=%f %f\n",wL[0],wL[1]);
float wpgs = wglop(_DEG, pL[1]) * wglop(_DEG, pL[2]);
for(int iv = 0; iv < _M; ++iv) {
if (ipgL >= 0){
int imemL = Varindex(ipgL, iv) + woffset;
printf("imemL=%d wpgs=%f\n",imemL,wpgs);
dtwn[imemL] -= flux[iv] * wpgs;
}
if (ipgR >= 0){
int imemR = Varindex(ipgR, iv) + woffset;
printf("imemR=%d\n",imemR);
dtwn[imemR] += flux[iv] * wpgs;
}
}
......
......@@ -40,9 +40,9 @@ verif_cpu = np.fromfunction(lambda i: i * i + i, (taille, ), dtype = np.float32)
prg = cl.Program(ctx, source).build()
prg.K1(queue, (taille, ), None, x_gpu).wait()
event = prg.K1(queue, (taille, ), None, x_gpu)
prg.K2(queue, (taille, ), None, y_gpu).wait()
prg.K3(queue, (taille, ), None, x_gpu, y_gpu, z_gpu).wait()
prg.K3(queue, (taille, ), None , x_gpu, y_gpu, z_gpu, wait_for = [event]).wait()
cl.enqueue_copy(queue, z_cpu, z_gpu)
......
......@@ -5,23 +5,31 @@ from __future__ import absolute_import, print_function
import pyopencl as cl
import numpy as np
with open('kernels.cl', 'r') as f:
source = f.read()
ctx = cl.create_some_context()
m = 2
raf = 1
deg = 1
source = source.replace("THE_DEG", str(deg))
source = source.replace("THE_RAF", str(raf))
source = source.replace("THE_M", str(m))
ctx = cl.create_some_context()
prg = cl.Program(ctx, source).build()
point = [ ( 0.0, 0.0, 0.0),
( 1.0, 0.0, 0.0),
( 1.0, 1.0, 0.0),
( 0.0, 1.0, 0.0),
( 0.0, 0.0, 1.0),
( 1.0, 0.0, 1.0),
( 1.0, 1.0, 1.0),
( 0.0, 1.0, 1.0)]
point_cpu = np.array([0.0, 0.0, 0.0,
1.0, 0.0, 0.0,
1.0, 1.0, 0.0,
0.0, 1.0, 0.0,
0.0, 0.0, 1.0,
1.0, 0.0, 1.0,
1.0, 1.0, 1.0,
0.0, 1.0, 1.0], dtype = np.float32)
element = [0,1,2,3,4,5,6,7]
......@@ -32,4 +40,56 @@ face2node = [[0,1,5,4],
[5,6,7,4],
[0,3,2,1]]
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
wsize = m * raf**3 * (deg+1)**3
row_size = raf * (deg + 1)
point_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = point_cpu)
wn_cpu = np.zeros(wsize, dtype = np.float32)
wn_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, size=(wsize * np.dtype('float32').itemsize))
dtwn_cpu = np.zeros(wsize, dtype = np.float32)
dtwn_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf = dtwn_cpu);
ie = np.int32(0)
prg.Init(queue, (row_size, row_size, row_size), None, ie, point_gpu, wn_gpu).wait()
cl.enqueue_copy(queue, wn_cpu, wn_gpu)
#print(wn_cpu)
wn_out = wn_cpu.reshape((2*row_size**2,row_size))
#print(wn_out)
import matplotlib.pyplot as plt
x_cpu = np.zeros(3 * row_size**3, dtype = np.float32)
x_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, x_cpu.nbytes)
prg.get_nodes(queue, (row_size, row_size, row_size),
None, ie, point_gpu, x_gpu).wait()
cl.enqueue_copy(queue, x_cpu, x_gpu).wait()
plt.plot(x_cpu[0:row_size],wn_out[0])
plt.plot(x_cpu[0:row_size],wn_out[1])
#plt.show()
#print(x_cpu[0:row_size])
#print(x_cpu[0:row_size])
tnow = np.float32(0)
npts = deg+1
prg.DGFlux(queue, (raf+1, row_size, row_size), (1, 1, 1), ie, point_gpu,tnow, wn_gpu,dtwn_gpu).wait()
prg.DGVolume(queue, (row_size, row_size, row_size), (npts,npts,npts), ie, point_gpu, wn_gpu,dtwn_gpu).wait()
cl.enqueue_copy(queue, dtwn_cpu, dtwn_gpu)
print(dtwn_cpu)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment