815 lines
28 KiB
C
815 lines
28 KiB
C
/* gx2_shader_gen.c - Fast3D GX2 shader generator for libultraship

   Created in 2022 by GaryOderNichts
*/
|
|
#ifdef __WIIU__
|
|
|
|
#include "gx2_shader_gen.h"
|
|
#include "gx2_shader_inl.h"
|
|
|
|
#include <malloc.h>
|
|
#include <gx2/mem.h>
|
|
|
|
/* Round x up to the next multiple of align. The result is produced by masking
   off the low bits, so align is expected to be a power of two. Both arguments
   are evaluated more than once. */
#define ROUNDUP(x, align) (((x) + ((align) - 1)) & ~((align) - 1))
|
|
|
|
/* Fixed pixel-shader register assignments used throughout this file. */

/* fragment position, read by the noise code (gl_FragCoord) */
#define FRAG_COORD_REG _R0
/* working/result colour ("texel"); this is the register that gets exported */
#define TEXEL_REG _R1
/* fog colour (rgb) and fog factor (a) */
#define FOG_REG _R3
/* grayscale colour (rgb) and blend factor (a) */
#define GRAYSCALE_REG _R4
|
|
|
|
enum {
|
|
SHADER_TEXINFO0 = SHADER_COMBINED + 1,
|
|
SHADER_TEXINFO1,
|
|
};
|
|
|
|
static uint8_t get_reg(struct CCFeatures *cc_features, uint8_t c) {
|
|
if (c == SHADER_0) {
|
|
return ALU_SRC_0;
|
|
}
|
|
if (c == SHADER_1) {
|
|
return ALU_SRC_1;
|
|
}
|
|
|
|
if (c == SHADER_COMBINED) {
|
|
return TEXEL_REG;
|
|
}
|
|
|
|
if (c >= SHADER_INPUT_1 && c <= SHADER_INPUT_7) {
|
|
return _R(5 + (c - SHADER_INPUT_1));
|
|
}
|
|
|
|
uint8_t input_last = (cc_features->num_inputs + 5) - 1;
|
|
|
|
if (c == SHADER_TEXEL0 || c == SHADER_TEXEL0A) {
|
|
// reuse unused regs
|
|
if (!cc_features->opt_noise) {
|
|
return FRAG_COORD_REG;
|
|
} else if (!cc_features->opt_fog) {
|
|
return FOG_REG;
|
|
} else if (!cc_features->opt_grayscale) {
|
|
return GRAYSCALE_REG;
|
|
}
|
|
|
|
return _R(input_last + 1);
|
|
}
|
|
if (c == SHADER_TEXEL1 || c == SHADER_TEXEL1A) {
|
|
// if the shader doesn't use texture 0 we can reuse it for texture 1
|
|
if (!cc_features->used_textures[0]) {
|
|
return get_reg(cc_features, SHADER_TEXEL0);
|
|
}
|
|
// reuse unused regs which tex 0 doesn't use yet
|
|
else if (!cc_features->opt_fog && get_reg(cc_features, SHADER_TEXEL0) != FOG_REG) {
|
|
return FOG_REG;
|
|
} else if (!cc_features->opt_grayscale && get_reg(cc_features, SHADER_TEXEL0) != GRAYSCALE_REG) {
|
|
return GRAYSCALE_REG;
|
|
}
|
|
|
|
return _R(input_last + 2);
|
|
}
|
|
|
|
// reuse the regs above
|
|
if (c == SHADER_TEXINFO0) {
|
|
return get_reg(cc_features, SHADER_TEXEL0);
|
|
}
|
|
if (c == SHADER_TEXINFO1) {
|
|
return get_reg(cc_features, SHADER_TEXEL1);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static uint8_t get_num_regs(struct CCFeatures *cc_features) {
|
|
uint8_t input_count = cc_features->num_inputs + 5;
|
|
|
|
uint8_t last_tex_reg;
|
|
if (cc_features->used_textures[1]) {
|
|
last_tex_reg = get_reg(cc_features, SHADER_TEXEL1) + 1;
|
|
} else if (cc_features->used_textures[0]) {
|
|
last_tex_reg = get_reg(cc_features, SHADER_TEXEL0) + 1;
|
|
} else {
|
|
return input_count;
|
|
}
|
|
|
|
return (last_tex_reg < input_count) ? input_count : last_tex_reg;
|
|
}
|
|
|
|
/*
 * Append the given 64-bit instruction words to the stream addressed by the
 * in-scope `uint64_t **alu_ptr` and advance `*alu_ptr` past them.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement: the
 * temporary array stays private to the macro and the macro is safe inside
 * unbraced if/else bodies or when used more than once per scope.
 */
#define ADD_INSTR(...)                                  \
    do {                                                \
        uint64_t tmp[] = {__VA_ARGS__};                 \
        memcpy(*alu_ptr, tmp, sizeof(tmp));             \
        *alu_ptr += sizeof(tmp) / sizeof(uint64_t);     \
    } while (0)
|
|
|
|
static inline void add_tex_clamp_S_T(struct CCFeatures *cc_features, uint64_t **alu_ptr, uint8_t tex) {
|
|
uint8_t texinfo_reg = get_reg(cc_features, (tex == 0) ? SHADER_TEXINFO0 : SHADER_TEXINFO1);
|
|
uint8_t texcoord_reg = (tex == 0) ? _R1 : _R2;
|
|
|
|
ADD_INSTR(
|
|
/* R127.xy = (float) texinfo.xy */
|
|
ALU_INT_TO_FLT(_R127, _x, texinfo_reg, _x) SCL_210
|
|
ALU_LAST,
|
|
|
|
ALU_INT_TO_FLT(_R127, _y, texinfo_reg, _y) SCL_210
|
|
ALU_LAST,
|
|
|
|
/* R127.xy = 0.5f / texSize */
|
|
ALU_RECIP_IEEE(__, _x, _R127, _x) SCL_210
|
|
ALU_LAST,
|
|
|
|
ALU_MUL_IEEE(_R127, _x, ALU_SRC_PS, _x, ALU_SRC_0_5, _x),
|
|
ALU_RECIP_IEEE(__, _y, _R127, _y) SCL_210
|
|
ALU_LAST,
|
|
|
|
ALU_MUL_IEEE(_R127, _y, ALU_SRC_PS, _y, ALU_SRC_0_5, _x)
|
|
ALU_LAST,
|
|
|
|
/* texCoord.xy = clamp(texCoord.xy, R127.xy, texClamp.xy) */
|
|
ALU_MAX(__, _x, texcoord_reg, _x, _R127, _x),
|
|
ALU_MAX(__, _y, texcoord_reg, _y, _R127, _y)
|
|
ALU_LAST,
|
|
|
|
ALU_MIN(texcoord_reg, _x, ALU_SRC_PV, _x, texcoord_reg, _z),
|
|
ALU_MIN(texcoord_reg, _y, ALU_SRC_PV, _y, texcoord_reg, _w)
|
|
ALU_LAST,
|
|
);
|
|
}
|
|
|
|
static inline void add_tex_clamp_S(struct CCFeatures *cc_features, uint64_t **alu_ptr, uint8_t tex) {
|
|
uint8_t texinfo_reg = get_reg(cc_features, (tex == 0) ? SHADER_TEXINFO0 : SHADER_TEXINFO1);
|
|
uint8_t texcoord_reg = (tex == 0) ? _R1 : _R2;
|
|
|
|
ADD_INSTR(
|
|
/* R127.x = (float) texinfo.x */
|
|
ALU_INT_TO_FLT(_R127, _x, texinfo_reg, _x) SCL_210
|
|
ALU_LAST,
|
|
|
|
/* R127.x = 0.5f / texSize */
|
|
ALU_RECIP_IEEE(__, _x, _R127, _x) SCL_210
|
|
ALU_LAST,
|
|
|
|
ALU_MUL_IEEE(_R127, _x, ALU_SRC_PS, _x, ALU_SRC_0_5, _x)
|
|
ALU_LAST,
|
|
|
|
/* texCoord.xy = clamp(texCoord.xy, R127.xy, texClamp.xy) */
|
|
ALU_MAX(__, _x, texcoord_reg, _x, _R127, _x)
|
|
ALU_LAST,
|
|
|
|
ALU_MIN(texcoord_reg, _x, ALU_SRC_PV, _x, texcoord_reg, _z)
|
|
ALU_LAST,
|
|
);
|
|
}
|
|
|
|
static inline void add_tex_clamp_T(struct CCFeatures *cc_features, uint64_t **alu_ptr, uint8_t tex) {
|
|
uint8_t texinfo_reg = get_reg(cc_features, (tex == 0) ? SHADER_TEXINFO0 : SHADER_TEXINFO1);
|
|
uint8_t texcoord_reg = (tex == 0) ? _R1 : _R2;
|
|
|
|
ADD_INSTR(
|
|
/* R127.y = (float) texinfo.y */
|
|
ALU_INT_TO_FLT(_R127, _y, texinfo_reg, _y) SCL_210
|
|
ALU_LAST,
|
|
|
|
/* R127.y = 0.5f / texSize */
|
|
ALU_RECIP_IEEE(__, _x, _R127, _y) SCL_210
|
|
ALU_LAST,
|
|
|
|
ALU_MUL_IEEE(_R127, _y, ALU_SRC_PS, _x, ALU_SRC_0_5, _x)
|
|
ALU_LAST,
|
|
|
|
/* texCoord.xy = clamp(texCoord.xy, R127.xy, texClamp.xy) */
|
|
ALU_MAX(__, _y, texcoord_reg, _y, _R127, _y)
|
|
ALU_LAST,
|
|
|
|
ALU_MIN(texcoord_reg, _y, ALU_SRC_PV, _y, texcoord_reg, _w)
|
|
ALU_LAST,
|
|
);
|
|
}
|
|
|
|
static inline void add_mov(struct CCFeatures *cc_features, uint64_t **alu_ptr, uint8_t src, bool single) {
|
|
bool src_alpha = (src == SHADER_TEXEL0A) || (src == SHADER_TEXEL1A);
|
|
src = get_reg(cc_features, src);
|
|
|
|
/* texel = src */
|
|
if (single) {
|
|
ADD_INSTR(
|
|
ALU_MOV(TEXEL_REG, _w, src, _w)
|
|
ALU_LAST,
|
|
);
|
|
} else {
|
|
ADD_INSTR(
|
|
ALU_MOV(TEXEL_REG, _x, src, src_alpha ? _w :_x),
|
|
ALU_MOV(TEXEL_REG, _y, src, src_alpha ? _w :_y),
|
|
ALU_MOV(TEXEL_REG, _z, src, src_alpha ? _w :_z)
|
|
ALU_LAST,
|
|
);
|
|
}
|
|
}
|
|
|
|
static inline void add_mul(struct CCFeatures *cc_features, uint64_t **alu_ptr, uint8_t src0, uint8_t src1, bool single) {
|
|
bool src0_alpha = (src0 == SHADER_TEXEL0A) || (src0 == SHADER_TEXEL1A);
|
|
bool src1_alpha = (src1 == SHADER_TEXEL0A) || (src1 == SHADER_TEXEL1A);
|
|
src0 = get_reg(cc_features, src0);
|
|
src1 = get_reg(cc_features, src1);
|
|
|
|
/* texel = src0 * src1 */
|
|
if (single) {
|
|
ADD_INSTR(
|
|
ALU_MUL(TEXEL_REG, _w, src0, _w, src1, _w)
|
|
ALU_LAST,
|
|
);
|
|
} else {
|
|
ADD_INSTR(
|
|
ALU_MUL(TEXEL_REG, _x, src0, src0_alpha ? _w : _x, src1, src1_alpha ? _w : _x),
|
|
ALU_MUL(TEXEL_REG, _y, src0, src0_alpha ? _w : _y, src1, src1_alpha ? _w : _y),
|
|
ALU_MUL(TEXEL_REG, _z, src0, src0_alpha ? _w : _z, src1, src1_alpha ? _w : _z)
|
|
ALU_LAST,
|
|
);
|
|
}
|
|
}
|
|
|
|
static inline void add_mix(struct CCFeatures *cc_features, uint64_t **alu_ptr, uint8_t src0, uint8_t src1, uint8_t src2, uint8_t src3, bool single) {
|
|
bool src0_alpha = (src0 == SHADER_TEXEL0A) || (src0 == SHADER_TEXEL1A);
|
|
bool src1_alpha = (src1 == SHADER_TEXEL0A) || (src1 == SHADER_TEXEL1A);
|
|
bool src2_alpha = (src2 == SHADER_TEXEL0A) || (src2 == SHADER_TEXEL1A);
|
|
bool src3_alpha = (src3 == SHADER_TEXEL0A) || (src3 == SHADER_TEXEL1A);
|
|
src0 = get_reg(cc_features, src0);
|
|
src1 = get_reg(cc_features, src1);
|
|
src2 = get_reg(cc_features, src2);
|
|
src3 = get_reg(cc_features, src3);
|
|
|
|
/* texel = (src0 - src1) * src2 - src3 */
|
|
if (single) {
|
|
ADD_INSTR(
|
|
ALU_ADD(__, _w, src0, _w, src1 _NEG, _w)
|
|
ALU_LAST,
|
|
|
|
ALU_MULADD(TEXEL_REG, _w, ALU_SRC_PV, _w, src2, _w, src3, _w)
|
|
ALU_LAST,
|
|
);
|
|
} else {
|
|
ADD_INSTR(
|
|
ALU_ADD(__, _x, src0, src0_alpha ? _w : _x, src1 _NEG, src1_alpha ? _w : _x),
|
|
ALU_ADD(__, _y, src0, src0_alpha ? _w : _y, src1 _NEG, src1_alpha ? _w : _y),
|
|
ALU_ADD(__, _z, src0, src0_alpha ? _w : _z, src1 _NEG, src1_alpha ? _w : _z)
|
|
ALU_LAST,
|
|
|
|
ALU_MULADD(TEXEL_REG, _x, ALU_SRC_PV, _x, src2, src2_alpha ? _w : _x, src3, src3_alpha ? _w : _x),
|
|
ALU_MULADD(TEXEL_REG, _y, ALU_SRC_PV, _y, src2, src2_alpha ? _w : _y, src3, src3_alpha ? _w : _y),
|
|
ALU_MULADD(TEXEL_REG, _z, ALU_SRC_PV, _z, src2, src2_alpha ? _w : _z, src3, src3_alpha ? _w : _z)
|
|
ALU_LAST,
|
|
);
|
|
}
|
|
}
|
|
#undef ADD_INSTR
|
|
|
|
/*
 * Append the clamp sequence for texture `tex`: both axes when s and t are set,
 * S only when just s is set, and the T sequence in every other case
 * (including s == t == false).
 */
static void append_tex_clamp(struct CCFeatures *cc_features, uint64_t **alu_ptr, uint8_t tex, bool s, bool t) {
    if (!s) {
        add_tex_clamp_T(cc_features, alu_ptr, tex);
        return;
    }

    if (t) {
        add_tex_clamp_S_T(cc_features, alu_ptr, tex);
    } else {
        add_tex_clamp_S(cc_features, alu_ptr, tex);
    }
}
|
|
|
|
/*
 * Append the instructions for one combiner formula.
 *
 * `c` holds the four source ids for the colour row (index 0) and the alpha row
 * (index 1); `only_alpha` selects the row and is forwarded as the "single
 * channel" flag. The cheapest matching form is emitted:
 *   do_single   -> texel = c[3]
 *   do_multiply -> texel = c[0] * c[2]
 *   do_mix      -> texel = (c[0] - c[1]) * c[2] + c[1]
 *   otherwise   -> texel = (c[0] - c[1]) * c[2] + c[3]
 */
static void append_formula(struct CCFeatures *cc_features, uint64_t **alu_ptr, uint8_t c[2][4], bool do_single, bool do_multiply, bool do_mix, bool only_alpha) {
    const uint8_t *row = c[only_alpha];

    if (do_single) {
        add_mov(cc_features, alu_ptr, row[3], only_alpha);
        return;
    }

    if (do_multiply) {
        add_mul(cc_features, alu_ptr, row[0], row[2], only_alpha);
        return;
    }

    /* a "mix" adds the subtrahend back, the general form adds the fourth source */
    const uint8_t addend = do_mix ? row[1] : row[3];
    add_mix(cc_features, alu_ptr, row[0], row[1], row[2], addend, only_alpha);
}
|
|
|
|
static const uint64_t noise_instructions[] = {
|
|
/* R127 = floor(gl_FragCoord.xy * window_params.x) */
|
|
ALU_MUL(__, _x, FRAG_COORD_REG, _x, _C(0), _x),
|
|
ALU_MUL(__, _y, FRAG_COORD_REG, _y, _C(0), _x)
|
|
ALU_LAST,
|
|
|
|
ALU_FLOOR(_R127, _x, ALU_SRC_PV, _x),
|
|
ALU_FLOOR(_R127, _y, ALU_SRC_PV, _y)
|
|
ALU_LAST,
|
|
|
|
/* R127 = sin(vec3(R127.x, R127.y, window_params.y)) */
|
|
ALU_MULADD(_R127, _x, _R127, _x, ALU_SRC_LITERAL, _x, ALU_SRC_0_5, _x),
|
|
ALU_MULADD(_R127, _y, _R127, _y, ALU_SRC_LITERAL, _x, ALU_SRC_0_5, _x),
|
|
ALU_MULADD(_R127, _z, _C(0), _y, ALU_SRC_LITERAL, _x, ALU_SRC_0_5, _x)
|
|
ALU_LAST,
|
|
ALU_LITERAL(0x3E22F983 /* 0.1591549367f (radians -> revolutions) */),
|
|
|
|
ALU_FRACT(__, _x, _R127, _x),
|
|
ALU_FRACT(__, _y, _R127, _y),
|
|
ALU_FRACT(__, _z, _R127, _z)
|
|
ALU_LAST,
|
|
|
|
ALU_MULADD(_R127, _x, ALU_SRC_PV, _x, ALU_SRC_LITERAL, _x, ALU_SRC_LITERAL, _y),
|
|
ALU_MULADD(_R127, _y, ALU_SRC_PV, _y, ALU_SRC_LITERAL, _x, ALU_SRC_LITERAL, _y),
|
|
ALU_MULADD(_R127, _z, ALU_SRC_PV, _z, ALU_SRC_LITERAL, _x, ALU_SRC_LITERAL, _y)
|
|
ALU_LAST,
|
|
ALU_LITERAL2(0x40C90FDB /* 6.283185482f (tau) */, 0xC0490FDB /* -3.141592741f (-pi) */),
|
|
|
|
ALU_MUL(_R127, _x, ALU_SRC_PV, _x, ALU_SRC_LITERAL, _x),
|
|
ALU_MUL(_R127, _y, ALU_SRC_PV, _y, ALU_SRC_LITERAL, _x),
|
|
ALU_MUL(_R127, _z, ALU_SRC_PV, _z, ALU_SRC_LITERAL, _x)
|
|
ALU_LAST,
|
|
ALU_LITERAL(0x3E22F983 /* 0.1591549367f (radians -> revolutions) */),
|
|
|
|
ALU_SIN(_R127, _x, _R127, _x) SCL_210
|
|
ALU_LAST,
|
|
|
|
ALU_SIN(_R127, _y, _R127, _y) SCL_210
|
|
ALU_LAST,
|
|
|
|
ALU_SIN(_R127, _z, _R127, _z) SCL_210
|
|
ALU_LAST,
|
|
|
|
/* R127.x = dot(R127.xyz, vec3(12.9898, 78.233, 37.719)); */
|
|
ALU_DOT4(_R127, _x, _R127, _x, ALU_SRC_LITERAL, _x),
|
|
ALU_DOT4(__, _y, _R127, _y, ALU_SRC_LITERAL, _y),
|
|
ALU_DOT4(__, _z, _R127, _z, ALU_SRC_LITERAL, _z),
|
|
ALU_DOT4(__, _w, ALU_SRC_LITERAL, _w, ALU_SRC_0, _x)
|
|
ALU_LAST,
|
|
ALU_LITERAL4(0x414FD639 /* 12.9898f */, 0x429C774C /* 78.233f */, 0x4216E042 /* 37.719f */, 0x80000000 /* -0.0f */),
|
|
|
|
/* R127.x = fract(sin(R127.x) * 143758.5453); */
|
|
ALU_MULADD(_R127, _x, _R127, _x, ALU_SRC_LITERAL, _x, ALU_SRC_0_5, _x)
|
|
ALU_LAST,
|
|
ALU_LITERAL(0x3E22F983 /* 0.1591549367f (radians -> revolutions) */),
|
|
|
|
ALU_FRACT(__, _x, _R127, _x)
|
|
ALU_LAST,
|
|
|
|
ALU_MULADD(_R127, _x, ALU_SRC_PV, _x, ALU_SRC_LITERAL, _x, ALU_SRC_LITERAL, _y)
|
|
ALU_LAST,
|
|
ALU_LITERAL2(0x40C90FDB /* 6.283185482f (tau) */, 0xC0490FDB /* -3.141592741f (-pi) */),
|
|
|
|
ALU_SIN(_R127, _x, _R127, _x) SCL_210
|
|
ALU_LAST,
|
|
|
|
ALU_MUL(__, _x, _R127, _x, ALU_SRC_LITERAL, _x)
|
|
ALU_LAST,
|
|
ALU_LITERAL(0x480C63A3 /* 143758.5453f */),
|
|
|
|
ALU_FRACT( _R127, _x, ALU_SRC_PV, _x)
|
|
ALU_LAST,
|
|
|
|
/* texel.a *= floor(R127.x + 0.5); */
|
|
ALU_ADD(__, _x, _R127, _x, ALU_SRC_0_5, _x)
|
|
ALU_LAST,
|
|
|
|
ALU_FLOOR(__, _x, ALU_SRC_PV, _x)
|
|
ALU_LAST,
|
|
|
|
ALU_MUL(TEXEL_REG, _w, TEXEL_REG, _w, ALU_SRC_PV, _x)
|
|
ALU_LAST,
|
|
};
|
|
|
|
static GX2UniformVar uniformVars[] = {
|
|
{ "window_params", GX2_SHADER_VAR_TYPE_FLOAT2, 1, 0, -1, },
|
|
};
|
|
|
|
static GX2SamplerVar samplerVars[] = {
|
|
{ "uTex0", GX2_SAMPLER_VAR_TYPE_SAMPLER_2D, 0 },
|
|
{ "uTex1", GX2_SAMPLER_VAR_TYPE_SAMPLER_2D, 1 },
|
|
};
|
|
|
|
/*
 * Append the given 64-bit instruction words at the in-scope `uint64_t *cur_buf`
 * cursor and move the cursor past them. Expands to a single statement.
 */
#define ADD_INSTR(...)                                            \
    do {                                                          \
        uint64_t add_instr_words[] = {__VA_ARGS__};               \
        memcpy(cur_buf, add_instr_words, sizeof(add_instr_words)); \
        cur_buf += sizeof(add_instr_words) / sizeof(uint64_t);    \
    } while (0)
|
|
|
|
/*
 * Assemble the pixel shader for the given combiner feature set into a freshly
 * allocated program buffer and fill in `psh` (registers, program pointer and
 * size, uniform and sampler tables).
 *
 * Buffer layout, in 64-bit words:
 *   [0 ..)          control-flow instructions (written last)
 *   [32 ..)         optional texture-clamp ALU clause
 *   then            main ALU clause 0 (combiner, fog, texture edge)
 *   then            main ALU clause 1 (noise, grayscale, alpha tests)
 *   16-word aligned texture-info fetches followed by the texture samples
 *
 * Returns 0 on success and -1 if the program buffer cannot be allocated.
 * The buffer is handed over to `psh->program`.
 */
static int generatePixelShader(GX2PixelShader *psh, struct CCFeatures *cc_features) {
    static const size_t max_program_buf_size = 512 * sizeof(uint64_t);
    // start placing alus at offset 32
    static const uint32_t base_alu_offset = 32;

    uint64_t *program_buf = memalign(GX2_SHADER_PROGRAM_ALIGNMENT, max_program_buf_size);
    if (program_buf == NULL) {
        return -1;
    }
    memset(program_buf, 0, max_program_buf_size);

    // write cursor used by ADD_INSTR
    uint64_t *cur_buf = NULL;

    // a texture needs clamp code if it is used and clamps on at least one axis
    bool texclamp[2];
    for (int i = 0; i < 2; i++) {
        texclamp[i] = cc_features->used_textures[i]
                   && (cc_features->clamp[i][0] || cc_features->clamp[i][1]);
    }
    const bool any_texclamp = texclamp[0] || texclamp[1];

    // texclamp alu
    const uint32_t texclamp_alu_offset = base_alu_offset;
    uint32_t texclamp_alu_cnt = 0;

    if (any_texclamp) {
        uint64_t *const texclamp_start = program_buf + texclamp_alu_offset;
        cur_buf = texclamp_start;

        for (int i = 0; i < 2; i++) {
            if (texclamp[i]) {
                append_tex_clamp(cc_features, &cur_buf, i, cc_features->clamp[i][0], cc_features->clamp[i][1]);
            }
        }

        texclamp_alu_cnt = (uint32_t) (cur_buf - texclamp_start);
    }

    // main alu0
    const uint32_t main_alu0_offset = texclamp_alu_offset + texclamp_alu_cnt;
    uint64_t *const main_alu0_start = program_buf + main_alu0_offset;
    cur_buf = main_alu0_start;

    const int num_cycles = cc_features->opt_2cyc ? 2 : 1;
    for (int cycle = 0; cycle < num_cycles; cycle++) {
        // colour formula first, then (if enabled) the alpha formula
        append_formula(cc_features, &cur_buf, cc_features->c[cycle], cc_features->do_single[cycle][0], cc_features->do_multiply[cycle][0], cc_features->do_mix[cycle][0], false);
        if (cc_features->opt_alpha) {
            append_formula(cc_features, &cur_buf, cc_features->c[cycle], cc_features->do_single[cycle][1], cc_features->do_multiply[cycle][1], cc_features->do_mix[cycle][1], true);
        }
    }

    if (cc_features->opt_fog) {
        ADD_INSTR(
            /* texel.rgb = mix(texel.rgb, vFog.rgb, vFog.a); */
            ALU_ADD(__, _x, FOG_REG, _x, _R1 _NEG, _x),
            ALU_ADD(__, _y, FOG_REG, _y, _R1 _NEG, _y),
            ALU_ADD(__, _z, FOG_REG, _z, _R1 _NEG, _z)
            ALU_LAST,

            ALU_MULADD(TEXEL_REG, _x, ALU_SRC_PV, _x, FOG_REG, _w, TEXEL_REG, _x),
            ALU_MULADD(TEXEL_REG, _y, ALU_SRC_PV, _y, FOG_REG, _w, TEXEL_REG, _y),
            ALU_MULADD(TEXEL_REG, _z, ALU_SRC_PV, _z, FOG_REG, _w, TEXEL_REG, _z)
            ALU_LAST,
        );
    }

    if (cc_features->opt_texture_edge && cc_features->opt_alpha) {
        ADD_INSTR(
            /* if (texel.a > 0.19) texel.a = 1.0; else discard; */
            ALU_KILLGT(__, _x, ALU_SRC_LITERAL, _x, TEXEL_REG, _w),
            ALU_MOV(TEXEL_REG, _w, ALU_SRC_1, _x)
            ALU_LAST,
            ALU_LITERAL(0x3e428f5c /*0.19f*/),
        );
    }

    const uint32_t main_alu0_cnt = (uint32_t) (cur_buf - main_alu0_start);

    // main alu1
    // place the following instructions into a new alu, in case the other alu uses KILL
    const uint32_t main_alu1_offset = main_alu0_offset + main_alu0_cnt;
    uint64_t *const main_alu1_start = program_buf + main_alu1_offset;
    cur_buf = main_alu1_start;

    if (cc_features->opt_alpha && cc_features->opt_noise) {
        memcpy(cur_buf, noise_instructions, sizeof(noise_instructions));
        cur_buf += sizeof(noise_instructions) / sizeof(uint64_t);
    }

    if (cc_features->opt_grayscale) {
        ADD_INSTR(
            /* texel.r + texel.g + texel.b */
            ALU_ADD(__, _x, TEXEL_REG, _x, TEXEL_REG, _y)
            ALU_LAST,

            ALU_ADD(__, _x, ALU_SRC_PV, _x, TEXEL_REG, _z)
            ALU_LAST,

            /* PV.x / 3 */
            ALU_MUL_IEEE(__, _x, ALU_SRC_PV, _x, ALU_SRC_LITERAL, _x)
            ALU_LAST,
            ALU_LITERAL(0x3eaaaaab /*0.3333333433f*/),

            /* texel.rgb = mix(texel.rgb, vGrayscaleColor.rgb * intensity, vGrayscaleColor.a); */
            ALU_MULADD(_R127, _x, GRAYSCALE_REG, _x, ALU_SRC_PV, _x, _R1 _NEG, _x),
            ALU_MULADD(_R127, _y, GRAYSCALE_REG, _y, ALU_SRC_PV, _x, _R1 _NEG, _y),
            ALU_MULADD(_R127, _z, GRAYSCALE_REG, _z, ALU_SRC_PV, _x, _R1 _NEG, _z)
            ALU_LAST,

            ALU_MULADD(TEXEL_REG, _x, ALU_SRC_PV, _x, GRAYSCALE_REG, _w, TEXEL_REG, _x),
            ALU_MULADD(TEXEL_REG, _y, ALU_SRC_PV, _y, GRAYSCALE_REG, _w, TEXEL_REG, _y),
            ALU_MULADD(TEXEL_REG, _z, ALU_SRC_PV, _z, GRAYSCALE_REG, _w, TEXEL_REG, _z)
            ALU_LAST,
        );
    }

    if (cc_features->opt_alpha && cc_features->opt_alpha_threshold) {
        ADD_INSTR(
            /* if (texel.a < 8.0 / 256.0) discard; */
            ALU_KILLGT(__, _x, ALU_SRC_LITERAL, _x, TEXEL_REG, _w)
            ALU_LAST,
            ALU_LITERAL(0x3d000000 /*0.03125f*/),
        );
    }

    if (cc_features->opt_alpha && cc_features->opt_invisible) {
        ADD_INSTR(
            /* texel.a = 0.0; */
            ALU_MOV(TEXEL_REG, _w, ALU_SRC_0, _x)
            ALU_LAST,
        );
    }

    const uint32_t main_alu1_cnt = (uint32_t) (cur_buf - main_alu1_start);

    // tex
    const uint32_t num_textures = cc_features->used_textures[0] + cc_features->used_textures[1];
    const uint32_t num_texinfo = texclamp[0] + texclamp[1];

    // texture instructions start at the next 16-word boundary after the alus
    const uint32_t texinfo_offset = ROUNDUP(main_alu1_offset + main_alu1_cnt, 16);
    uint32_t cur_tex_offset = texinfo_offset;

    // texture size queries, only for textures that are clamped
    for (int i = 0; i < 2; i++) {
        if (!texclamp[i]) {
            continue;
        }

        uint8_t info_reg = get_reg(cc_features, (i == 0) ? SHADER_TEXINFO0 : SHADER_TEXINFO1);

        uint64_t texinfo_buf[] = {
            TEX_GET_TEXTURE_INFO(info_reg, _x, _y, _m, _m, _R1, _0, _0, _0, _0, _t(i), _s(i))
        };

        memcpy(program_buf + cur_tex_offset, texinfo_buf, sizeof(texinfo_buf));
        cur_tex_offset += sizeof(texinfo_buf) / sizeof(uint64_t);
    }

    // texture samples directly follow the size queries
    const uint32_t texsample_offset = cur_tex_offset;

    for (int i = 0; i < 2; i++) {
        if (!cc_features->used_textures[i]) {
            continue;
        }

        uint8_t coord_reg = (i == 0) ? _R1 : _R2;
        uint8_t texel_reg = get_reg(cc_features, (i == 0) ? SHADER_TEXEL0 : SHADER_TEXEL1);

        uint64_t tex_buf[] = {
            TEX_SAMPLE(texel_reg, _x, _y, _z, _w, coord_reg, _x, _y, _0, _x, _t(i), _s(i))
        };

        memcpy(program_buf + cur_tex_offset, tex_buf, sizeof(tex_buf));
        cur_tex_offset += sizeof(tex_buf) / sizeof(uint64_t);
    }

    // make sure we didn't overflow the buffer
    const uint32_t total_program_size = cur_tex_offset * sizeof(uint64_t);
    assert(total_program_size <= max_program_buf_size);

    // cf
    uint32_t cur_cf_offset = 0;

    // if we use texclamp place those alus first
    if (any_texclamp) {
        program_buf[cur_cf_offset++] = TEX(texinfo_offset, num_texinfo);
        program_buf[cur_cf_offset++] = ALU(texclamp_alu_offset, texclamp_alu_cnt);
    }

    if (num_textures > 0) {
        program_buf[cur_cf_offset++] = TEX(texsample_offset, num_textures) VALID_PIX;
    }

    program_buf[cur_cf_offset++] = ALU(main_alu0_offset, main_alu0_cnt);

    if (main_alu1_cnt > 0) {
        program_buf[cur_cf_offset++] = ALU(main_alu1_offset, main_alu1_cnt);
    }

    // export the result; without alpha the w channel is forced to 1
    if (cc_features->opt_alpha) {
        program_buf[cur_cf_offset++] = EXP_DONE(PIX0, TEXEL_REG, _x, _y, _z, _w) END_OF_PROGRAM;
    } else {
        program_buf[cur_cf_offset++] = EXP_DONE(PIX0, TEXEL_REG, _x, _y, _z, _1) END_OF_PROGRAM;
    }

    // regs
    const uint32_t num_ps_inputs = 4 + cc_features->num_inputs;

    psh->regs.sq_pgm_resources_ps = get_num_regs(cc_features); // num_gprs
    psh->regs.sq_pgm_exports_ps = 2; // export_mode
    psh->regs.spi_ps_in_control_0 = (num_ps_inputs + 1) // num_interp
        | (1 << 8)   // position_ena
        | (1 << 26)  // persp_gradient_ena
        | (1 << 28); // baryc_sample_cntl

    psh->regs.num_spi_ps_input_cntl = num_ps_inputs + 1;

    // frag pos
    psh->regs.spi_ps_input_cntls[0] = 0 | (1 << 8);

    // inputs: slot n (n >= 1) carries semantic n - 1
    for (uint32_t slot = 1; slot <= num_ps_inputs; slot++) {
        psh->regs.spi_ps_input_cntls[slot] = (slot - 1) | (1 << 8);
    }

    psh->regs.cb_shader_mask = 0xf; // output0_enable
    psh->regs.cb_shader_control = 1; // rt0_enable
    psh->regs.db_shader_control = (1 << 4) // z_order
        | (1 << 6); // kill_enable

    // program
    psh->size = total_program_size;
    psh->program = program_buf;

    psh->mode = GX2_SHADER_MODE_UNIFORM_REGISTER;

    // uniform vars
    psh->uniformVars = uniformVars;
    psh->uniformVarCount = sizeof(uniformVars) / sizeof(GX2UniformVar);

    // samplers
    psh->samplerVars = samplerVars;
    psh->samplerVarCount = sizeof(samplerVars) / sizeof(GX2SamplerVar);

    return 0;
}
|
|
|
|
static GX2AttribVar attribVars[] = {
|
|
{ "aVtxPos", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 0 },
|
|
{ "aTexCoord0", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 1 },
|
|
{ "aTexCoord1", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 2 },
|
|
{ "aFog", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 3 },
|
|
{ "aGrayscaleColor", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 4 },
|
|
{ "aInput1", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 5 },
|
|
{ "aInput2", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 6 },
|
|
{ "aInput3", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 7 },
|
|
{ "aInput4", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 8 },
|
|
{ "aInput5", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 9 },
|
|
{ "aInput6", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 10 },
|
|
{ "aInput7", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 11 },
|
|
};
|
|
|
|
/*
 * Assemble the pass-through vertex shader: it calls the fetch shader, exports
 * R1 as the position and R2.. as parameters (one per pixel-shader input), then
 * fills in the register state and attribute table of `vsh`.
 *
 * Returns 0 on success and -1 if the program buffer cannot be allocated.
 * The buffer is handed over to `vsh->program`.
 */
static int generateVertexShader(GX2VertexShader *vsh, struct CCFeatures *cc_features) {
    static const size_t max_program_buf_size = 16 * sizeof(uint64_t);

    uint64_t *program_buf = memalign(GX2_SHADER_PROGRAM_ALIGNMENT, max_program_buf_size);
    if (program_buf == NULL) {
        return -1;
    }

    // two texcoords, fog and grayscale plus the combiner inputs
    const uint32_t num_ps_inputs = 4 + cc_features->num_inputs;

    // write cursor used by ADD_INSTR
    uint64_t *cur_buf = program_buf;

    // aVtxPos
    ADD_INSTR(
        CALL_FS NO_BARRIER,
        EXP_DONE(POS0, _R1, _x, _y, _z, _w),
    );

    // params, all but the last one
    for (uint32_t param = 0; param < num_ps_inputs - 1; param++) {
        ADD_INSTR(
            EXP(PARAM(param), _R(param + 2), _x, _y, _z, _w) NO_BARRIER,
        );
    }

    // last param terminates the export and the program
    ADD_INSTR(
        (EXP_DONE(PARAM(num_ps_inputs - 1), _R(num_ps_inputs + 1), _x, _y, _z, _w) NO_BARRIER)
        END_OF_PROGRAM,
    );

    const uint32_t program_size = (uint32_t) ((cur_buf - program_buf) * sizeof(uint64_t));
    assert(program_size <= max_program_buf_size);

    // regs
    vsh->regs.sq_pgm_resources_vs = (num_ps_inputs + 2) // num_gprs
        | (1 << 8); // stack_size

    // num outputs minus 1
    vsh->regs.spi_vs_out_config = ((num_ps_inputs - 1) << 1);

    // output ids 0..10, four per register, unused slots stay 0xff
    vsh->regs.num_spi_vs_out_id = 3;
    memset(vsh->regs.spi_vs_out_id, 0xff, sizeof(vsh->regs.spi_vs_out_id));
    vsh->regs.spi_vs_out_id[0] = (0) | (1 << 8) | (2 << 16) | (3 << 24);
    vsh->regs.spi_vs_out_id[1] = (4) | (5 << 8) | (6 << 16) | (7 << 24);
    vsh->regs.spi_vs_out_id[2] = (8) | (9 << 8) | (10 << 16) | (0xff << 24);

    // semantics 0..11 in attribVars order:
    // aVtxPos, aTexCoord0, aTexCoord1, aFog, aGrayscaleColor, aInput1..aInput7
    vsh->regs.sq_vtx_semantic_clear = ~((1 << 12) - 1);
    vsh->regs.num_sq_vtx_semantic = 12;
    memset(vsh->regs.sq_vtx_semantic, 0xff, sizeof(vsh->regs.sq_vtx_semantic));
    for (uint32_t semantic = 0; semantic < 12; semantic++) {
        vsh->regs.sq_vtx_semantic[semantic] = semantic;
    }

    vsh->regs.vgt_vertex_reuse_block_cntl = 14; // vtx_reuse_depth
    vsh->regs.vgt_hos_reuse_depth = 16; // reuse_depth

    // program
    vsh->program = program_buf;
    vsh->size = program_size;

    vsh->mode = GX2_SHADER_MODE_UNIFORM_REGISTER;

    // attribs
    vsh->attribVarCount = sizeof(attribVars) / sizeof(GX2AttribVar);
    vsh->attribVars = attribVars;

    return 0;
}
|
|
#undef ADD_INSTR
|
|
|
|
int gx2GenerateShaderGroup(struct ShaderGroup *group, struct CCFeatures *cc_features) {
|
|
memset(group, 0, sizeof(struct ShaderGroup));
|
|
|
|
// generate the pixel shader
|
|
if (generatePixelShader(&group->pixelShader, cc_features) != 0) {
|
|
gx2FreeShaderGroup(group);
|
|
return -1;
|
|
}
|
|
|
|
// generate the vertex shader
|
|
if (generateVertexShader(&group->vertexShader, cc_features) != 0) {
|
|
gx2FreeShaderGroup(group);
|
|
return -1;
|
|
}
|
|
|
|
uint32_t attribOffset = 0;
|
|
|
|
// aVtxPos
|
|
group->attributes[group->numAttributes++] =
|
|
(GX2AttribStream) { 0, 0, attribOffset, GX2_ATTRIB_FORMAT_FLOAT_32_32_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _z, _w), GX2_ENDIAN_SWAP_DEFAULT };
|
|
attribOffset += 4 * sizeof(float);
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
if (cc_features->used_textures[i]) {
|
|
// aTexCoordX
|
|
group->attributes[group->numAttributes++] =
|
|
(GX2AttribStream) { 1 + i, 0, attribOffset, GX2_ATTRIB_FORMAT_FLOAT_32_32_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _z, _w), GX2_ENDIAN_SWAP_DEFAULT };
|
|
attribOffset += 4 * sizeof(float);
|
|
}
|
|
}
|
|
|
|
// aFog
|
|
if (cc_features->opt_fog) {
|
|
group->attributes[group->numAttributes++] =
|
|
(GX2AttribStream) { 3, 0, attribOffset, GX2_ATTRIB_FORMAT_FLOAT_32_32_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _z, _w), GX2_ENDIAN_SWAP_DEFAULT };
|
|
attribOffset += 4 * sizeof(float);
|
|
}
|
|
|
|
// aGrayscaleColor
|
|
if (cc_features->opt_grayscale) {
|
|
group->attributes[group->numAttributes++] =
|
|
(GX2AttribStream) { 4, 0, attribOffset, GX2_ATTRIB_FORMAT_FLOAT_32_32_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _z, _w), GX2_ENDIAN_SWAP_DEFAULT };
|
|
attribOffset += 4 * sizeof(float);
|
|
}
|
|
|
|
// aInput
|
|
for (int i = 0; i < cc_features->num_inputs; i++) {
|
|
group->attributes[group->numAttributes++] =
|
|
(GX2AttribStream) { 5 + i, 0, attribOffset, GX2_ATTRIB_FORMAT_FLOAT_32_32_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _z, _w), GX2_ENDIAN_SWAP_DEFAULT };
|
|
attribOffset += 4 * sizeof(float);
|
|
}
|
|
|
|
group->stride = attribOffset;
|
|
|
|
// init the fetch shader
|
|
group->fetchShader.size = GX2CalcFetchShaderSizeEx(group->numAttributes, GX2_FETCH_SHADER_TESSELLATION_NONE, GX2_TESSELLATION_MODE_DISCRETE);
|
|
group->fetchShader.program = memalign(GX2_SHADER_PROGRAM_ALIGNMENT, group->fetchShader.size);
|
|
if (!group->fetchShader.program) {
|
|
gx2FreeShaderGroup(group);
|
|
return -1;
|
|
}
|
|
|
|
GX2InitFetchShaderEx(&group->fetchShader, group->fetchShader.program, group->numAttributes, group->attributes, GX2_FETCH_SHADER_TESSELLATION_NONE, GX2_TESSELLATION_MODE_DISCRETE);
|
|
|
|
// invalidate all programs
|
|
GX2Invalidate(GX2_INVALIDATE_MODE_CPU_SHADER, group->vertexShader.program, group->vertexShader.size);
|
|
GX2Invalidate(GX2_INVALIDATE_MODE_CPU_SHADER, group->pixelShader.program, group->pixelShader.size);
|
|
GX2Invalidate(GX2_INVALIDATE_MODE_CPU_SHADER, group->fetchShader.program, group->fetchShader.size);
|
|
|
|
return 0;
|
|
}
|
|
|
|
void gx2FreeShaderGroup(struct ShaderGroup *group) {
|
|
free(group->vertexShader.program);
|
|
free(group->pixelShader.program);
|
|
free(group->fetchShader.program);
|
|
}
|
|
|
|
#endif
|