Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
David Coulette
patapon
Commits
b465350d
Commit
b465350d
authored
Jul 18, 2016
by
Philippe Helluy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
bug in 1d transport
parent
f240b67c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
304 additions
and
65 deletions
+304
-65
kernels.cl
kernels.cl
+231
-52
simple_kernels.py
simple_kernels.py
+2
-2
testtranspose.py
testtranspose.py
+71
-11
No files found.
kernels.cl
View file @
b465350d
#
define
_DEG
2
#
define
_RAF
3
#
define
_M
2
#
define
_DEG
THE_DEG
#
define
_RAF
THE_RAF
#
define
_M
THE_M
#
define
_NP
(
_DEG
+
1
)
...
...
@@ -103,7 +103,7 @@ __constant float gauss_lob_dpsi[] = {
-5.33333333333333333333333333336
,
3.49148624377587810025755976667
,
0
,
-3.49148624377587810025755976662
,
-3.49148624377587810025755976662
,
5.33333333333333333333333333336
,
2.82032835588485332564727827399
,
-1.52752523165194666886268239791
,
...
...
@@ -143,9 +143,12 @@ int Varindex(int ipg, int iv) {
return
ipg
+
npg
*
iv
;
}
__constant
float
vit[3]={1,
0
,
0}
;
void
NumFlux
(
float
*wL,
float
*wR,
float
*vnorm,
float
*flux
)
{
float
vn
=
sqrt
(
0.5
)
*
vnorm[0]
+
sqrt
(
0.5
)
*
vnorm[1]
;
float
vn
=
vit[0]
*
vnorm[0]
+
vit[1]
*
vnorm[1]
+
vit[2]
*
vnorm[2]
;
float
vnp
=
vn
>
0.0
?
vn
:
0.0
;
float
vnm
=
vn
-
vnp
;
flux[0]
=
vnp
*
wL[0]
+
vnm
*
wR[0]
;
...
...
@@ -154,15 +157,19 @@ void NumFlux(float *wL, float *wR, float *vnorm, float *flux)
vnm
=
vn
-
vnp
;
flux[1]
=
vnp
*
wL[1]
+
vnm
*
wR[1]
;
}
void
ImposedData
(
const
float
*x,
const
float
t
,
float*
w
)
{
float
vx
=
sqrt
(
0.5
)
*
x[0]
+
sqrt
(
0.5
)
*
x[1
]
;
float
vx
=
vit[0]
*
x[0]
+
vit[1]
*
x[1]
+
vit[2]
*
x[2
]
;
float
xx
=
vx
-
t
;
w[0]
=
xx
*
xx
;
w[0]
=
sin
(
xx
)
;
float
pi
=
3.1415926535897932384626433832795028841971694
;
//w[0]
=
xx
*
xx
;
w[0]
=
sin
(
2*pi*xx
)
;
w[0]=xx
;
xx
=
vx
+
t
;
w[1]
=
xx
*
xx
;
w[1]
=
sin
(
xx
)
;
//w[1]
=
xx
*
xx
;
w[1]
=
sin
(
2*pi*xx
)
;
w[1]=xx
;
}
...
...
@@ -222,19 +229,160 @@ float t30 = t2 * t16;
float
t32
=
x
*
t2
;
float
t34
=
x
*
y
;
float
t36
=
y
*
t16
;
dtau[0][0]
=
-t3
*
p[0]
+
t3
*
p[3]
-
t6
*
p[6]
+
t6
*
p[9]
+
t9
*
p[12]
-
t9
*
p[15]
+
t12
*
p[18]
-
t12
*
p[21]
;
dtau[0][1]
=
-t17
*
p[0]
+
t19
*
p[3]
-
t19
*
p[6]
+
t17
*
p[9]
+
t23
*
p[12]
-
t25
*
p[15]
+
t25
*
p[18]
-
t23
*
p[21]
;
dtau[0][2]
=
-t30
*
p[0]
+
t32
*
p[3]
-
t34
*
p[6]
+
t36
*
p[9]
+
t30
*
p[12]
-
t32
*
p[15]
+
t34
*
p[18]
-
t36
*
p[21]
;
dtau[1][0]
=
-t3
*
p[1]
+
t3
*
p[4]
-
t6
*
p[7]
+
t6
*
p[10]
+
t9
*
p[13]
-
t9
*
p[16]
+
t12
*
p[19]
-
t12
*
p[22]
;
dtau[1][1]
=
-t17
*
p[1]
+
t19
*
p[4]
-
t19
*
p[7]
+
t17
*
p[10]
+
t23
*
p[13]
-
t25
*
p[16]
+
t25
*
p[19]
-
t23
*
p[22]
;
dtau[1][2]
=
-t30
*
p[1]
+
t32
*
p[4]
-
t34
*
p[7]
+
t36
*
p[10]
+
t30
*
p[13]
-
t32
*
p[16]
+
t34
*
p[19]
-
t36
*
p[22]
;
dtau[2][0]
=
-t3
*
p[2]
+
t3
*
p[5]
-
t6
*
p[8]
+
t6
*
p[11]
+
t9
*
p[14]
-
t9
*
p[17]
+
t12
*
p[20]
-
t12
*
p[23]
;
dtau[2][1]
=
-t17
*
p[2]
+
t19
*
p[5]
-
t19
*
p[8]
+
t17
*
p[11]
+
t23
*
p[14]
-
t25
*
p[17]
+
t25
*
p[20]
-
t23
*
p[23]
;
dtau[2][2]
=
-t30
*
p[2]
+
t32
*
p[5]
-
t34
*
p[8]
+
t36
*
p[11]
+
t30
*
p[14]
-
t32
*
p[17]
+
t34
*
p[20]
-
t36
*
p[23]
;
dtau[0][0]
=
-t3
*
p[0]
+
t3
*
p[3]
-
t6
*
p[6]
+
t6
*
p[9]
+
t9
*
p[12]
-
t9
*
p[15]
+
t12
*
p[18]
-
t12
*
p[21]
;
dtau[0][1]
=
-t17
*
p[0]
+
t19
*
p[3]
-
t19
*
p[6]
+
t17
*
p[9]
+
t23
*
p[12]
-
t25
*
p[15]
+
t25
*
p[18]
-
t23
*
p[21]
;
dtau[0][2]
=
-t30
*
p[0]
+
t32
*
p[3]
-
t34
*
p[6]
+
t36
*
p[9]
+
t30
*
p[12]
-
t32
*
p[15]
+
t34
*
p[18]
-
t36
*
p[21]
;
dtau[1][0]
=
-t3
*
p[1]
+
t3
*
p[4]
-
t6
*
p[7]
+
t6
*
p[10]
+
t9
*
p[13]
-
t9
*
p[16]
+
t12
*
p[19]
-
t12
*
p[22]
;
dtau[1][1]
=
-t17
*
p[1]
+
t19
*
p[4]
-
t19
*
p[7]
+
t17
*
p[10]
+
t23
*
p[13]
-
t25
*
p[16]
+
t25
*
p[19]
-
t23
*
p[22]
;
dtau[1][2]
=
-t30
*
p[1]
+
t32
*
p[4]
-
t34
*
p[7]
+
t36
*
p[10]
+
t30
*
p[13]
-
t32
*
p[16]
+
t34
*
p[19]
-
t36
*
p[22]
;
dtau[2][0]
=
-t3
*
p[2]
+
t3
*
p[5]
-
t6
*
p[8]
+
t6
*
p[11]
+
t9
*
p[14]
-
t9
*
p[17]
+
t12
*
p[20]
-
t12
*
p[23]
;
dtau[2][1]
=
-t17
*
p[2]
+
t19
*
p[5]
-
t19
*
p[8]
+
t17
*
p[11]
+
t23
*
p[14]
-
t25
*
p[17]
+
t25
*
p[20]
-
t23
*
p[23]
;
dtau[2][2]
=
-t30
*
p[2]
+
t32
*
p[5]
-
t34
*
p[8]
+
t36
*
p[11]
+
t30
*
p[14]
-
t32
*
p[17]
+
t34
*
p[20]
-
t36
*
p[23]
;
/*
dtau[0][0]
=
-
(
-1
+
z
)
*
(
-1
+
y
)
*
p[0]
+
(
-1
+
z
)
*
(
-1
+
y
)
*
p[3]
-
y
*
(
-1
+
z
)
*
p[6]
+
y
*
(
-1
+
z
)
*
p[9]
+
z
*
(
-1
+
y
)
*
p[12]
-
z
*
(
-1
+
y
)
*
p[15]
+
y
*
z
*
p[18]
-
y
*
z
*
p[21]
; */
/*
dtau[0][1]
=
-
(
-1
+
z
)
*
(
-1
+
x
)
*
p[0]
+
x
*
(
-1
+
z
)
*
p[3]
-
x
*
(
-1
+
z
)
*
p[6]
+
(
-1
+
z
)
*
(
-1
+
x
)
*
p[9]
+
z
*
(
-1
+
x
)
*
p[12]
-
x
*
z
*
p[15]
+
x
*
z
*
p[18]
-
z
*
(
-1
+
x
)
*
p[21]
; */
/*
dtau[0][2]
=
-
(
-1
+
y
)
*
(
-1
+
x
)
*
p[0]
+
x
*
(
-1
+
y
)
*
p[3]
-
x
*
y
*
p[6]
+
y
*
(
-1
+
x
)
*
p[9]
+
(
-1
+
y
)
*
(
-1
+
x
)
*
p[12]
-
x
*
(
-1
+
y
)
*
p[15]
+
x
*
y
*
p[18]
-
y
*
(
-1
+
x
)
*
p[21]
; */
/*
dtau[1][0]
=
-
(
-1
+
z
)
*
(
-1
+
y
)
*
p[1]
+
(
-1
+
z
)
*
(
-1
+
y
)
*
p[4]
-
y
*
(
-1
+
z
)
*
p[7]
+
y
*
(
-1
+
z
)
*
p[10]
+
z
*
(
-1
+
y
)
*
p[13]
-
z
*
(
-1
+
y
)
*
p[16]
+
y
*
z
*
p[19]
-
y
*
z
*
p[22]
; */
/*
dtau[1][1]
=
-
(
-1
+
z
)
*
(
-1
+
x
)
*
p[1]
+
x
*
(
-1
+
z
)
*
p[4]
-
x
*
(
-1
+
z
)
*
p[7]
+
(
-1
+
z
)
*
(
-1
+
x
)
*
p[10]
+
z
*
(
-1
+
x
)
*
p[13]
-
x
*
z
*
p[16]
+
x
*
z
*
p[19]
-
z
*
(
-1
+
x
)
*
p[22]
; */
/*
dtau[1][2]
=
-
(
-1
+
y
)
*
(
-1
+
x
)
*
p[1]
+
x
*
(
-1
+
y
)
*
p[4]
-
x
*
y
*
p[7]
+
y
*
(
-1
+
x
)
*
p[10]
+
(
-1
+
y
)
*
(
-1
+
x
)
*
p[13]
-
x
*
(
-1
+
y
)
*
p[16]
+
x
*
y
*
p[19]
-
y
*
(
-1
+
x
)
*
p[22]
; */
/*
dtau[2][0]
=
-
(
-1
+
z
)
*
(
-1
+
y
)
*
p[2]
+
(
-1
+
z
)
*
(
-1
+
y
)
*
p[5]
-
y
*
(
-1
+
z
)
*
p[8]
+
y
*
(
-1
+
z
)
*
p[11]
+
z
*
(
-1
+
y
)
*
p[14]
-
z
*
(
-1
+
y
)
*
p[17]
+
y
*
z
*
p[20]
-
y
*
z
*
p[23]
; */
/*
dtau[2][1]
=
-
(
-1
+
z
)
*
(
-1
+
x
)
*
p[2]
+
x
*
(
-1
+
z
)
*
p[5]
-
x
*
(
-1
+
z
)
*
p[8]
+
(
-1
+
z
)
*
(
-1
+
x
)
*
p[11]
+
z
*
(
-1
+
x
)
*
p[14]
-
x
*
z
*
p[17]
+
x
*
z
*
p[20]
-
z
*
(
-1
+
x
)
*
p[23]
; */
/*
dtau[2][2]
=
-
(
-1
+
y
)
*
(
-1
+
x
)
*
p[2]
+
x
*
(
-1
+
y
)
*
p[5]
-
x
*
y
*
p[8]
+
y
*
(
-1
+
x
)
*
p[11]
+
(
-1
+
y
)
*
(
-1
+
x
)
*
p[14]
-
x
*
(
-1
+
y
)
*
p[17]
+
x
*
y
*
p[20]
-
y
*
(
-1
+
x
)
*
p[23]
; */
/*
dtau[0][0]
=
1
; */
/*
dtau[0][1]
=
0
; */
/*
dtau[0][2]
=
0
; */
/*
dtau[1][0]
=
0
; */
/*
dtau[1][1]
=
1
; */
/*
dtau[1][2]
=
0
; */
/*
dtau[2][0]
=
0
; */
/*
dtau[2][1]
=
0
; */
/*
dtau[2][2]
=
1
; */
}
__kernel
void
get_nodes
(
int
ie,
//
1:
macrocel
index
__constant
float
*physnodes,
//
2:
macrocell
nodes
__global
float
*node
//
3:
node
coordinates
)
{
__constant
float
*physnode
=
physnodes
+
ie
*
3
*
_NHEXA
;
int
ix
=
get_global_id
(
0
)
;
int
iy
=
get_global_id
(
1
)
;
int
iz
=
get_global_id
(
2
)
;
//int
woffset
=
ie
*
_M
*
_NPG_MACROCELL
;
//
subcell
id
int
icL[3]
;
icL[0]
=
ix
/
_NP
;
icL[1]
=
iy
/
_NP
;
icL[2]
=
iz
/
_NP
;
int
pL[3]
;
pL[0]
=
ix
%
_NP
;
pL[1]
=
iy
%
_NP
;
pL[2]
=
iz
%
_NP
;
//
ref
coordinates
float
hx
=
1.
/
(
float
)
_RAF
;
float
hy
=
1.
/
(
float
)
_RAF
;
float
hz
=
1.
/
(
float
)
_RAF
;
int
offset[3]
=
{gauss_lob_offset[_DEG]
+
pL[0],
gauss_lob_offset[_DEG]
+
pL[1],
gauss_lob_offset[_DEG]
+
pL[2]}
;
float
x
=
hx
*
(
icL[0]
+
gauss_lob_point[offset[0]]
)
;
float
y
=
hy
*
(
icL[1]
+
gauss_lob_point[offset[1]]
)
;
float
z
=
hz
*
(
icL[2]
+
gauss_lob_point[offset[2]]
)
;
int
ipgL
=
pL[0]
+
_NP
*
icL[0]
+
_NPG_ROW
*
(
iy
+
_NPG_ROW
*
iz
)
;
float
xphy[3]
;
get_tau
(
x,
y,
z,
physnode,
xphy
)
;
node[ipgL
+
0
*
_NPG_MACROCELL]
=
xphy[0]
;
node[ipgL
+
1
*
_NPG_MACROCELL]
=
xphy[1]
;
node[ipgL
+
2
*
_NPG_MACROCELL]
=
xphy[2]
;
/*
printf
(
"ipgL=%d ijk=%d %d %d xyz=%f %f %f xphy=%f %f %f\n"
,
*/
/*
ipgL,ix,iy,iz,
*/
/*
x,y,z,xphy[0],xphy[1],xphy[2]
)
; */
}
__kernel
void
Init
(
int
ie,
//
1:
macrocel
index
__constant
float
*physnodes,
//
2:
macrocell
nodes
__global
float
*wn
//
3:
field
values
)
{
__constant
float
*physnode
=
physnodes
+
ie
*
3
*
_NHEXA
;
int
ix
=
get_global_id
(
0
)
;
int
iy
=
get_global_id
(
1
)
;
int
iz
=
get_global_id
(
2
)
;
int
woffset
=
ie
*
_M
*
_NPG_MACROCELL
;
//
subcell
id
int
icL[3]
;
icL[0]
=
ix
/
_NP
;
icL[1]
=
iy
/
_NP
;
icL[2]
=
iz
/
_NP
;
int
pL[3]
;
pL[0]
=
ix
%
_NP
;
pL[1]
=
iy
%
_NP
;
pL[2]
=
iz
%
_NP
;
//
ref
coordinates
float
hx
=
1.
/
_RAF
;
float
hy
=
1.
/
_RAF
;
float
hz
=
1.
/
_RAF
;
int
offset[3]
=
{gauss_lob_offset[_DEG]
+
pL[0],
gauss_lob_offset[_DEG]
+
pL[1],
gauss_lob_offset[_DEG]
+
pL[2]}
;
float
x
=
hx
*
(
icL[0]
+
gauss_lob_point[offset[0]]
)
;
float
y
=
hy
*
(
icL[1]
+
gauss_lob_point[offset[1]]
)
;
float
z
=
hz
*
(
icL[2]
+
gauss_lob_point[offset[2]]
)
;
int
ipgL
=
pL[0]
+
_NP
*
icL[0]
+
_NPG_ROW
*
(
iy
+
_NPG_ROW
*
iz
)
;
float
xphy[3]
;
get_tau
(
x,
y,
z,
physnode,
xphy
)
;
float
wL[_M]
;
float
tim
=
0
;
ImposedData
(
xphy,
tim,
wL
)
;
for
(
int
iv
=
0
; iv < _M; iv++) {
int
imemL
=
Varindex
(
ipgL,
iv
)
+
woffset
;
wn[imemL]
=
wL[iv]
;
//wn[imemL]
=
ix
;
}
}
__kernel
void
DGVolume
(
int
ie,
//
1:
macrocel
index
...
...
@@ -254,14 +402,16 @@ void DGVolume(int ie, // 1: macrocel index
//
subcell
id
int
icL[3]
;
icL[0]
=
ix
/
_
RAF
;
icL[1]
=
iy
/
_
RAF
;
icL[2]
=
iz
/
_
RAF
;
icL[0]
=
ix
/
_
NP
;
icL[1]
=
iy
/
_
NP
;
icL[2]
=
iz
/
_
NP
;
int
p[3]
;
p[0]
=
ix
%
_RAF
;
p[1]
=
iy
%
_RAF
;
p[2]
=
iz
%
_RAF
;
p[0]
=
ix
%
_NP
;
p[1]
=
iy
%
_NP
;
p[2]
=
iz
%
_NP
;
//
ref
coordinates
...
...
@@ -285,8 +435,17 @@ void DGVolume(int ie, // 1: macrocel index
float
codtau[3][3]
;
{
float
dtau[3][3]
;
//printf
(
"xyz=%f %f %f\n"
,
x,y,z
)
;
get_dtau
(
x,
y,
z,
physnode,
dtau
)
; // 1296 mults
/*
printf
(
"dtau=\n%f %f %f \n%f %f %f \n%f %f %f\n"
,
*/
/*
dtau[0][0],dtau[0][1],dtau[0][2],
*/
/*
dtau[1][0],dtau[1][1],dtau[1][2],
*/
/*
dtau[2][0],dtau[2][1],dtau[2][2]
)
; */
/*
for
(
int
ii=0
;ii < 8;ii++){ */
/*
printf
(
"physnode=%f %f %f\n"
,
*/
/*
physnode[3*ii+0],physnode[3*ii+1],physnode[3*ii+2]
)
; */
/*
}
*/
codtau[0][0]
=
dtau[1][1]
*
dtau[2][2]
-
dtau[1][2]
*
dtau[2][1]
;
codtau[0][1]
=
-dtau[1][0]
*
dtau[2][2]
+
dtau[1][2]
*
dtau[2][0]
;
codtau[0][2]
=
dtau[1][0]
*
dtau[2][1]
-
dtau[1][1]
*
dtau[2][0]
;
...
...
@@ -324,21 +483,32 @@ void DGVolume(int ie, // 1: macrocel index
}
NumFlux
(
wL,
wL,
dphi,
flux
)
; // 3m mults when using NumFlux
int
ipgR
=
(
ix
+
iq
)
%
_NP
+
_NPG_ROW
*
(
iy
+
_NPG_ROW
*
iz
)
;
for
(
int
iv
=
0
; iv < _M; iv++) {
int
imemR
=
Varindex
(
ipgR,
iv
)
+
woffset
;
printf
(
"imemR=%d wpg=%f\n"
,
imemR,wpg
)
;
dtwn[imemR]
+=
flux[iv]
*
wpg
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
iv
=
0
; iv < _M; iv++){
int
imemL
=
Varindex
(
ipgL,
iv
)
+
woffset
;
dtwn[imemL]
/=
wpg
;
}
}
//
Compute
the
surface
terms
inside
one
macrocell
__kernel
void
DGFlux
(
int
ie,
//
1:
macrocell
index
__constant
float
*physnodes,
//
3:
macrocell
nodes
__constant
float
*
tnow,
float
tnow,
__global
float
*wn,
//
4:
field
values
__global
float
*dtwn
//
5:
time
derivative
)
...
...
@@ -355,13 +525,13 @@ void DGFlux(int ie, // 1: macrocell index
//
subcell
id
int
icL[3]
;
icL[0]
=
ifx
-
1
;
icL[1]
=
iy
/
_
RAF
;
icL[2]
=
iz
/
_
RAF
;
icL[1]
=
iy
/
_
NP
;
icL[2]
=
iz
/
_
NP
;
int
pL[3]
;
pL[0]
=
_
RAF
-
1
;
pL[1]
=
iy
%
_
RAF
;
pL[2]
=
iz
%
_
RAF
;
pL[0]
=
_
NP
-
1
;
pL[1]
=
iy
%
_
NP
;
pL[2]
=
iz
%
_
NP
;
int
icR[3]
;
...
...
@@ -371,20 +541,32 @@ void DGFlux(int ie, // 1: macrocell index
int
pR[3]
;
pR[0]
=
0
;
pR[1]
=
iy
%
_
RAF
;
pR[2]
=
iz
%
_
RAF
;
pR[1]
=
iy
%
_
NP
;
pR[2]
=
iz
%
_
NP
;
float
hx
=
1.0
/
_RAF
;
float
hy
=
1.0
/
_RAF
;
float
hz
=
1.0
/
_RAF
;
int
offset[3]
=
{gauss_lob_offset[_DEG]
+
pL[0],
gauss_lob_offset[_DEG]
+
pL[1],
gauss_lob_offset[_DEG]
+
pL[2]}
;
float
x
=
hx
*
(
icL[0]
+
gauss_lob_point[offset[0]]
)
;
float
y
=
hy
*
(
icL[1]
+
gauss_lob_point[offset[1]]
)
;
float
z
=
hz
*
(
icL[2]
+
gauss_lob_point[offset[2]]
)
;
int
offset[3]
;
float
x,y,z
;
//if
(
ifx
>
0
)
{
offset[0]=gauss_lob_offset[_DEG]
+
pL[0]
;
offset[1]=gauss_lob_offset[_DEG]
+
pL[1]
;
offset[2]=gauss_lob_offset[_DEG]
+
pL[2]
;
x
=
hx
*
(
icL[0]
+
gauss_lob_point[offset[0]]
)
;
y
=
hy
*
(
icL[1]
+
gauss_lob_point[offset[1]]
)
;
z
=
hz
*
(
icL[2]
+
gauss_lob_point[offset[2]]
)
;
/*
}
else
{
*/
/*
//printf
(
"ifx=%d pR=%d %d %d\n"
,
ifx,pR[0],pR[1],pR[2]
)
; */
/*
offset[0]=gauss_lob_offset[_DEG]
+
pR[0]
; */
/*
offset[1]=gauss_lob_offset[_DEG]
+
pR[1]
; */
/*
offset[2]=gauss_lob_offset[_DEG]
+
pR[2]
; */
/*
x
=
hx
*
(
icR[0]
+
gauss_lob_point[offset[0]]
)
; */
/*
y
=
hy
*
(
icR[1]
+
gauss_lob_point[offset[1]]
)
; */
/*
z
=
hz
*
(
icR[2]
+
gauss_lob_point[offset[2]]
)
; */
/*
}
*/
float
codtau[3][3]
;
{
...
...
@@ -410,19 +592,17 @@ void DGFlux(int ie, // 1: macrocell index
int
ipgL
=
-1
;
if
(
ifx
>
0
)
ipgL
=
pL[0]
+
_
RAF
*
icL[0]
+
_NPG_ROW
*
(
iy
+
_NPG_ROW
*
iz
)
;
ipgL
=
pL[0]
+
_
NP
*
icL[0]
+
_NPG_ROW
*
(
iy
+
_NPG_ROW
*
iz
)
;
int
ipgR
=
-1
;
if
(
ifx
<
_RAF
)
ipgR
=
pR[0]
+
_
RAF
*
icR[0]
+
_NPG_ROW
*
(
iy
+
_NPG_ROW
*
iz
)
;
ipgR
=
pR[0]
+
_
NP
*
icR[0]
+
_NPG_ROW
*
(
iy
+
_NPG_ROW
*
iz
)
;
float
xphy[3]
;
if
(
ipgL
<
0
||
ipgR
<
0
)
{
get_tau
(
x,
y,
z,
physnode,
xphy
)
;
}
get_tau
(
x,
y,
z,
physnode,
xphy
)
;
float
wL[_M],
wR[_M]
;
float
tim
=
*
tnow
;
float
tim
=
tnow
;
for
(
int
iv
=
0
; iv < _M; iv++) {
if
(
ipgL
>=
0
)
{
int
imemL
=
Varindex
(
ipgL,
iv
)
+
woffset
;
...
...
@@ -438,23 +618,22 @@ void DGFlux(int ie, // 1: macrocell index
}
}
//
TODO:
test
ipgL
and
ipgR
for
Boundary
terms
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
float
flux[_M]
;
NumFlux
(
wL,
wR,
vnds,
flux
)
; //
//printf
(
"wL=%f %f\n"
,
wL[0],wL[1]
)
;
float
wpgs
=
wglop
(
_DEG,
pL[1]
)
*
wglop
(
_DEG,
pL[2]
)
;
for
(
int
iv
=
0
; iv < _M; ++iv) {
if
(
ipgL
>=
0
)
{
int
imemL
=
Varindex
(
ipgL,
iv
)
+
woffset
;
printf
(
"imemL=%d wpgs=%f\n"
,
imemL,wpgs
)
;
dtwn[imemL]
-=
flux[iv]
*
wpgs
;
}
if
(
ipgR
>=
0
)
{
int
imemR
=
Varindex
(
ipgR,
iv
)
+
woffset
;
printf
(
"imemR=%d\n"
,
imemR
)
;
dtwn[imemR]
+=
flux[iv]
*
wpgs
;
}
}
...
...
simple_kernels.py
View file @
b465350d
...
...
@@ -40,9 +40,9 @@ verif_cpu = np.fromfunction(lambda i: i * i + i, (taille, ), dtype = np.float32)
prg
=
cl
.
Program
(
ctx
,
source
).
build
()
prg
.
K1
(
queue
,
(
taille
,
),
None
,
x_gpu
)
.
wait
()
event
=
prg
.
K1
(
queue
,
(
taille
,
),
None
,
x_gpu
)
prg
.
K2
(
queue
,
(
taille
,
),
None
,
y_gpu
).
wait
()
prg
.
K3
(
queue
,
(
taille
,
),
None
,
x_gpu
,
y_gpu
,
z_gpu
).
wait
()
prg
.
K3
(
queue
,
(
taille
,
),
None
,
x_gpu
,
y_gpu
,
z_gpu
,
wait_for
=
[
event
]
).
wait
()
cl
.
enqueue_copy
(
queue
,
z_cpu
,
z_gpu
)
...
...
testtranspose.py
View file @
b465350d
...
...
@@ -5,23 +5,31 @@ from __future__ import absolute_import, print_function
import
pyopencl
as
cl
import
numpy
as
np
with
open
(
'kernels.cl'
,
'r'
)
as
f
:
source
=
f
.
read
()
ctx
=
cl
.
create_some_context
()
m
=
2
raf
=
1
deg
=
1
source
=
source
.
replace
(
"THE_DEG"
,
str
(
deg
))
source
=
source
.
replace
(
"THE_RAF"
,
str
(
raf
))
source
=
source
.
replace
(
"THE_M"
,
str
(
m
))
ctx
=
cl
.
create_some_context
()
prg
=
cl
.
Program
(
ctx
,
source
).
build
()
point
=
[
(
0.0
,
0.0
,
0.0
),
(
1.0
,
0.0
,
0.0
),
(
1.0
,
1.0
,
0.0
),
(
0.0
,
1.0
,
0.0
),
(
0.0
,
0.0
,
1.0
),
(
1.0
,
0.0
,
1.0
),
(
1.0
,
1.0
,
1.0
),
(
0.0
,
1.0
,
1.0
)]
point_cpu
=
np
.
array
([
0.0
,
0.0
,
0.0
,
1.0
,
0.0
,
0.0
,
1.0
,
1.0
,
0.0
,
0.0
,
1.0
,
0.0
,
0.0
,
0.0
,
1.0
,
1.0
,
0.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.0
,
1.0
,
1.0
],
dtype
=
np
.
float32
)
element
=
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
]
...
...
@@ -32,4 +40,56 @@ face2node = [[0,1,5,4],
[
5
,
6
,
7
,
4
],
[
0
,
3
,
2
,
1
]]
queue
=
cl
.
CommandQueue
(
ctx
)
mf
=
cl
.
mem_flags
wsize
=
m
*
raf
**
3
*
(
deg
+
1
)
**
3
row_size
=
raf
*
(
deg
+
1
)
point_gpu
=
cl
.
Buffer
(
ctx
,
mf
.
READ_ONLY
|
mf
.
COPY_HOST_PTR
,
hostbuf
=
point_cpu
)
wn_cpu
=
np
.
zeros
(
wsize
,
dtype
=
np
.
float32
)
wn_gpu
=
cl
.
Buffer
(
ctx
,
mf
.
WRITE_ONLY
,
size
=
(
wsize
*
np
.
dtype
(
'float32'
).
itemsize
))
dtwn_cpu
=
np
.
zeros
(
wsize
,
dtype
=
np
.
float32
)
dtwn_gpu
=
cl
.
Buffer
(
ctx
,
mf
.
READ_WRITE
|
mf
.
COPY_HOST_PTR
,
hostbuf
=
dtwn_cpu
);
ie
=
np
.
int32
(
0
)
prg
.
Init
(
queue
,
(
row_size
,
row_size
,
row_size
),
None
,
ie
,
point_gpu
,
wn_gpu
).
wait
()
cl
.
enqueue_copy
(
queue
,
wn_cpu
,
wn_gpu
)
#print(wn_cpu)
wn_out
=
wn_cpu
.
reshape
((
2
*
row_size
**
2
,
row_size
))
#print(wn_out)
import
matplotlib.pyplot
as
plt
x_cpu
=
np
.
zeros
(
3
*
row_size
**
3
,
dtype
=
np
.
float32
)
x_gpu
=
cl
.
Buffer
(
ctx
,
mf
.
WRITE_ONLY
,
x_cpu
.
nbytes
)
prg
.
get_nodes
(
queue
,
(
row_size
,
row_size
,
row_size
),
None
,
ie
,
point_gpu
,
x_gpu
).
wait
()
cl
.
enqueue_copy
(
queue
,
x_cpu
,
x_gpu
).
wait
()
plt
.
plot
(
x_cpu
[
0
:
row_size
],
wn_out
[
0
])
plt
.
plot
(
x_cpu
[
0
:
row_size
],
wn_out
[
1
])
#plt.show()
#print(x_cpu[0:row_size])
#print(x_cpu[0:row_size])
tnow
=
np
.
float32
(
0
)
npts
=
deg
+
1
prg
.
DGFlux
(
queue
,
(
raf
+
1
,
row_size
,
row_size
),
(
1
,
1
,
1
),
ie
,
point_gpu
,
tnow
,
wn_gpu
,
dtwn_gpu
).
wait
()
prg
.
DGVolume
(
queue
,
(
row_size
,
row_size
,
row_size
),
(
npts
,
npts
,
npts
),
ie
,
point_gpu
,
wn_gpu
,
dtwn_gpu
).
wait
()
cl
.
enqueue_copy
(
queue
,
dtwn_cpu
,
dtwn_gpu
)
print
(
dtwn_cpu
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment