The tutorial https://nitish2112.github.io/post/adding-instruction-riscv/ teaches us how to add a simple modulo
instruction
mod r1, r2, r3
Semantics:
R[r1] = R[r2] % R[r3]
Modify src/arch/riscv/isa/decoder.isa to add mod in format ROp:
// Excerpt of the ROp decode tree: FUNCT3 is decoded first, then FUNCT7
// selects between add, mul, the new mod instruction, and sub.
0x33: decode FUNCT3 {
    format ROp {
        0x0: decode FUNCT7 {
            0x0: add({{
                Rd = Rs1_sd + Rs2_sd;
            }});
            0x1: mul({{
                Rd = Rs1_sd * Rs2_sd;
            }}, IntMultOp);
            // New instruction: Rd = Rs1_sd % Rs2_sd.
            // The remainder is computed on the host, so the two operand
            // pairs for which the host '%' is undefined are handled first
            // instead of crashing the simulator:
            //   - divisor 0  -> result is the dividend (same convention as
            //     the RISC-V REM instruction),
            //   - divisor -1 -> result is 0 (also covers INT64_MIN % -1).
            // NOTE(review): FUNCT7 0x10 under 0x33 does not match the
            // encoding given for the assembler in this document
            // (31..25=1, 6..2=0x1A, 1..0=3, i.e. MATCH_MOD 0x200006b) --
            // confirm which encoding is intended and align the two.
            0x10: mod({{
                if (Rs2_sd == 0) {
                    Rd = Rs1_sd;
                } else if (Rs2_sd == -1) {
                    Rd = 0;
                } else {
                    Rd = Rs1_sd % Rs2_sd;
                }
            }});
            0x20: sub({{
                Rd = Rs1_sd - Rs2_sd;
            }});
        }
- install riscv-tools
$ git clone https://github.com/riscv/riscv-tools.git
# Enter the checkout first: the submodule update and build.sh run inside it.
$ cd riscv-tools
$ git submodule update --init --recursive
$ export RISCV=/path/to/install/riscv/toolchain
$ ./build.sh
- modify
riscv-opcodes/opcodes
sra rd rs1 rs2 31..25=32 14..12=5 6..2=0x0C 1..0=3
or rd rs1 rs2 31..25=0 14..12=6 6..2=0x0C 1..0=3
and rd rs1 rs2 31..25=0 14..12=7 6..2=0x0C 1..0=3
# mod: the new entry added for this tutorial (rd/rs1/rs2 operands,
# bits 31..25 = 1, bits 14..12 = 0, bits 6..2 = 0x1A, bits 1..0 = 3).
mod rd rs1 rs2 31..25=1 14..12=0 6..2=0x1A 1..0=3
addiw rd rs1 imm12 14..12=0 6..2=0x06 1..0=3
slliw rd rs1 31..25=0 shamtw 14..12=1 6..2=0x06 1..0=3
srliw rd rs1 31..25=0 shamtw 14..12=5 6..2=0x06 1..0=3
sraiw rd rs1 31..25=32 shamtw 14..12=5 6..2=0x06 1..0=3
- add MATCH and MASK to
riscv-gnu-toolchain/riscv-binutils-gdb/include/opcode/riscv-opc.h
/* Encoding of mod, matching its riscv-opcodes entry: bits 31..25 = 1,
   bits 14..12 = 0 and bits 6..0 = 0x6b (6..2 = 0x1A, 1..0 = 3).
   MASK_MOD selects exactly those three fields (31..25, 14..12, 6..0). */
#define MATCH_MOD 0x200006b
#define MASK_MOD 0xfe00707f
- modify
riscv-gnu-toolchain/riscv-binutils-gdb/opcodes/riscv-opc.c
const struct riscv_opcode riscv_opcodes[] =
{
/* name, isa, operands, match, mask, match_func, pinfo. */
{"unimp", "C", "", 0, 0xffffU, match_opcode, 0 },
{"unimp", "I", "", MATCH_CSRRW | (CSR_CYCLE << OP_SH_CSR), 0xffffffffU, match_opcode, 0 }, /* csrw cycle, x0 */
{"ebreak", "C", "", MATCH_C_EBREAK, MASK_C_EBREAK, match_opcode, INSN_ALIAS },
{"ebreak", "I", "", MATCH_EBREAK, MASK_EBREAK, match_opcode, 0 },
{"sbreak", "C", "", MATCH_C_EBREAK, MASK_C_EBREAK, match_opcode, INSN_ALIAS },
{"sbreak", "I", "", MATCH_EBREAK, MASK_EBREAK, match_opcode, INSN_ALIAS },
....
....
....
/* New entry for mod, using the MATCH_MOD/MASK_MOD pair added to riscv-opc.h.
   The operand string "d,s,t" presumably maps to the rd, rs1, rs2 fields of
   the riscv-opcodes entry -- confirm against the neighbouring R-type rows.
   The trailing comma is required because further entries follow. */
{"mod", "I", "d,s,t", MATCH_MOD, MASK_MOD, match_opcode, 0 },
....
....
Next, add the prefetch instructions:
prefetch.i (omitted in this example)
prefetch.r
prefetch.w
- add
src/arch/riscv/isa/formats/cbo.isa
to define the prefetch instruction processing templates:
// Declaration template: emits one class per prefetch instruction. The class
// derives from the format's base class and declares a constructor plus the
// execute(), initiateAcc() and completeAcc() overrides that the other
// Prefetch templates define.
def template PrefetchDeclare {{
/**
* Static instruction class for "%(mnemonic)s".
*/
class %(class_name)s : public %(base_class)s
{
private:
%(reg_idx_arr_decl)s;
public:
/// Constructor.
%(class_name)s(ExtMachInst machInst);
Fault execute(ExecContext *, Trace::InstRecord *) const override;
Fault initiateAcc(ExecContext *, Trace::InstRecord *) const override;
Fault completeAcc(PacketPtr, ExecContext *,
Trace::InstRecord *) const override;
};
}};
// Constructor template: forwards the mnemonic, machine instruction and op
// class to the base class, then runs the substituted register-index setup,
// constructor code and offset computation. The PrefetchOp format appends the
// memAccessFlags assignment to the constructor code when mem_flags are given.
def template PrefetchConstructor {{
%(class_name)s::%(class_name)s(ExtMachInst machInst):
%(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
{
%(set_reg_idx_arr)s;
%(constructor)s;
%(offset_code)s;
}
}};
// execute() template: computes EA from the substituted ea_code, issues the
// read through readMemAtomicLE with the instruction's memAccessFlags, and
// always returns NoFault.
def template PrefetchExecute {{
Fault
%(class_name)s::execute(ExecContext *xc,
Trace::InstRecord *traceData) const
{
Addr EA;
%(op_decl)s;
%(op_rd)s;
%(ea_code)s;
// The return value of the read is discarded on purpose here: whatever the
// access reports, this instruction completes with NoFault.
readMemAtomicLE(xc, traceData, EA, Mem, memAccessFlags);
return NoFault;
}
}};
// initiateAcc() template: computes EA from the substituted ea_code and
// returns whatever initiateMemRead returns for that address and the
// instruction's memAccessFlags.
def template PrefetchInitiateAcc {{
Fault
%(class_name)s::initiateAcc(ExecContext *xc,
Trace::InstRecord *traceData) const
{
Addr EA;
%(op_decl)s;
%(op_rd)s;
%(ea_code)s;
return initiateMemRead(xc, traceData, EA, Mem, memAccessFlags);
}
}};
// completeAcc() template: the returned packet is not inspected and no
// register is written; the method only reports NoFault.
def template PrefetchCompleteAcc {{
Fault
%(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
Trace::InstRecord *traceData) const
{
return NoFault;
}
}};
def format PrefetchOp(memacc_code, mem_flags=[], inst_flags=[]) {{
    # Both flag arguments may be given as a single value or as a list.
    mem_flags = makeList(mem_flags)
    inst_flags = makeList(inst_flags)

    # Code snippets handed to InstObjParams. The memacc_code passed in by the
    # decoder entry is replaced with a fixed snippet that only names the Mem
    # operand; the offset and effective-address snippets are fixed as well.
    memacc_code = """Mem_sw"""
    offset_code = """offset = sext<12>(IMM5 | (IMM7 << 5));"""
    ea_code = """EA = Rs1 + offset;"""
    snippets = {
        'memacc_code': memacc_code,
        'offset_code': offset_code,
        'ea_code': ea_code,
    }
    iop = InstObjParams(name, Name, 'PrefetchOp', snippets, inst_flags)

    # When request flags were supplied, OR them together and assign the
    # result to memAccessFlags inside the generated constructor.
    if mem_flags:
        request_flags = ['Request::%s' % flag for flag in mem_flags]
        iop.constructor += \
            '\n\tmemAccessFlags = ' + '|'.join(request_flags) + ';'

    # Instantiate the Prefetch templates for this instruction.
    header_output = PrefetchDeclare.subst(iop)
    decoder_output = PrefetchConstructor.subst(iop)
    decode_block = BasicDecode.subst(iop)
    exec_output = (PrefetchExecute.subst(iop) +
                   PrefetchInitiateAcc.subst(iop) +
                   PrefetchCompleteAcc.subst(iop))
}};
- include "cbo.isa" in
src/arch/riscv/isa/formats/formats.isa
##include "cbo.isa"
- define PrefetchOp: add
src/arch/riscv/insts/cbo.hh
#ifndef __ARCH_RISCV_INST_CBO_HH__
#define __ARCH_RISCV_INST_CBO_HH__
#include <string>
#include "arch/riscv/insts/mem.hh"
#include "arch/riscv/insts/static_inst.hh"
#include "cpu/exec_context.hh"
#include "cpu/static_inst.hh"
namespace gem5
{
namespace RiscvISA
{
/**
 * Base class for the prefetch instructions: a MemInst that only adds its
 * own disassembly. The PrefetchOp format in cbo.isa passes this class name
 * to InstObjParams, presumably as the base class of each generated
 * instruction class -- confirm against the generated decoder.
 */
class PrefetchOp : public MemInst
{
protected:
// Inherit MemInst's constructors unchanged.
using MemInst::MemInst;
// Disassembly override; defined in cbo.cc.
std::string generateDisassembly(
Addr pc, const loader::SymbolTable *symtab) const override;
};
} // namespace RiscvISA
} // namespace gem5
#endif // __ARCH_RISCV_INST_CBO_HH__
add src/arch/riscv/insts/cbo.cc
#include "arch/riscv/insts/cbo.hh"
#include <sstream>
#include <string>
#include "arch/riscv/insts/bitfields.hh"
#include "arch/riscv/insts/static_inst.hh"
#include "arch/riscv/utility.hh"
#include "cpu/static_inst.hh"
namespace gem5
{
namespace RiscvISA
{

/**
 * Render the instruction as "<mnemonic> <offset>(<base register>)".
 *
 * Neither the pc nor the symbol table is consulted; the text is built from
 * the mnemonic, the offset member and the name of source register 0.
 */
std::string
PrefetchOp::generateDisassembly(Addr pc,
        const loader::SymbolTable *symtab) const
{
    std::stringstream text;
    text << mnemonic << ' ' << offset;
    text << '(' << registerName(srcRegIdx(0)) << ')';
    return text.str();
}

} // namespace RiscvISA
} // namespace gem5
modify src/arch/riscv/insts/SConscript: add
Source('cbo.cc', tags='riscv isa')
For simplicity, in src/arch/riscv/isa/bitfields.isa, add a PFTYPE field to distinguish the prefetch types (prefetch.i, prefetch.r, or prefetch.w):
// Prefetch type selector: instruction bits 24..20. decoder.isa switches on
// this field to pick prefetch.i (0x0), prefetch.r (0x1) or prefetch.w (0x3).
def bitfield PFTYPE <24:20>;
in src/arch/riscv/isa/decoder.isa
// Excerpt of the decode tree showing where the prefetch instructions hang:
// QUADRANT 0x03 -> OPCODE 0x04 -> FUNCT3 0x6 -> RD 0x00 -> PFTYPE.
decode QUADRANT default Unknown::unknown() {
0x03: decode OPCODE {
…
0x04: decode FUNCT3 {
0x6: decode RD {
// RD == 0: this is the slot otherwise taken by ori with rd = x0; PFTYPE
// chooses the prefetch variant.
// NOTE(review): PFTYPE values other than 0x0, 0x1 and 0x3 have no case
// here -- confirm they still decode as ori rather than as Unknown.
0x00: decode PFTYPE {
format PrefetchOp {
0x0: Prefetch_i({{
}}, inst_flags=IsInstPrefetch, mem_flags=PREFETCH);
0x1: Prefetch_r({{
}}, inst_flags=IsDataPrefetch, mem_flags=PREFETCH);
0x3: Prefetch_w({{
}}, inst_flags=IsDataPrefetch, mem_flags=PF_EXCLUSIVE);
}
}
// Any non-zero RD keeps the regular ori behaviour.
default: IOp::ori({{
Rd = Rs1 | imm;
}}, uint64_t);
}
}
in riscv-binutils/include/opcode/riscv-opc.h
/* The three prefetch encodings share bits 14..0 = 0x6013 and differ only in
   bits 24..20: 0 for prefetch.i, 1 for prefetch.r, 3 for prefetch.w.
   The common mask 0x1f07fff covers exactly bits 24..20 and bits 14..0. */
#define MATCH_PREFETCH_I 0x6013
#define MASK_PREFETCH_I 0x1f07fff
#define MATCH_PREFETCH_R 0x106013
#define MASK_PREFETCH_R 0x1f07fff
#define MATCH_PREFETCH_W 0x306013
#define MASK_PREFETCH_W 0x1f07fff
/* Register each instruction name with its match/mask pair. */
DECLARE_INSN(prefetch_i, MATCH_PREFETCH_I, MASK_PREFETCH_I)
DECLARE_INSN(prefetch_r, MATCH_PREFETCH_R, MASK_PREFETCH_R)
DECLARE_INSN(prefetch_w, MATCH_PREFETCH_W, MASK_PREFETCH_W)
in riscv-binutils/opcodes/riscv-opc.c
const struct riscv_opcode riscv_opcodes[] =
{
/* Entries added for the three prefetch variants; each pairs the mnemonic
   with the MATCH_/MASK_ constants defined in riscv-opc.h.  All three use the
   same "q(s)" operand string, i.e. an offset(base-register) operand as seen
   in the disassembly "prefetch.r 0(a5)". */
{"prefetch.i",64, INSN_CLASS_I, "q(s)", MATCH_PREFETCH_I, MASK_PREFETCH_I, match_opcode, 0 },
{"prefetch.r",64, INSN_CLASS_I, "q(s)", MATCH_PREFETCH_R, MASK_PREFETCH_R, match_opcode, 0 },
{"prefetch.w",64, INSN_CLASS_I, "q(s)", MATCH_PREFETCH_W, MASK_PREFETCH_W, match_opcode, 0 },
...
}
Write a sample program:
/* Minimal test: emit a single prefetch.r whose operand is a local variable. */
int main() {
/* Never initialised or read; it only provides an address to prefetch. */
int a;
/* "A" makes the compiler pass `a` as an address held in a register; the
   listing that follows shows it assembled as "prefetch.r 0(a5)". */
asm volatile ("prefetch.r %[addr]": : [addr]"A"(a));
}
Compiler produces:
0000000000010146 <main>:
10146: 1101 addi sp,sp,-32
10148: ec22 sd s0,24(sp)
1014a: 1000 addi s0,sp,32
1014c: fec40793 addi a5,s0,-20
10150: 0017e013 prefetch.r 0(a5)
10154: 4781 li a5,0
10156: 853e mv a0,a5
10158: 6462 ld s0,24(sp)
1015a: 6105 addi sp,sp,32
1015c: 8082 ret
Successfully generated prefetch instructions.
Use gdb to add breakpoints:
b Prefetch_r::initiateAcc
b Prefetch_r::completeAcc
#include <stdlib.h>
#include <gem5/m5ops.h>
const int MATRIX_SIZE = 100;
/* Prefetch distance, in elements ahead of the element being processed.
   The original listing used `stride` without declaring it; the results
   table reports runs with stride 16, 8 and 1. */
const int stride = 16;

/* Element-wise matrix addition C = A + B, issuing software prefetches for
   the elements `stride` positions ahead in each row. Only the region
   between m5_reset_stats() and m5_dump_stats() is measured. */
int main() {
    int** matA = (int**)malloc(MATRIX_SIZE * sizeof(int*));
    int** matB = (int**)malloc(MATRIX_SIZE * sizeof(int*));
    int** matC = (int**)malloc(MATRIX_SIZE * sizeof(int*));
    for(int i = 0; i < MATRIX_SIZE; i++) {
        matA[i] = (int*)malloc(MATRIX_SIZE * sizeof(int));
        matB[i] = (int*)malloc(MATRIX_SIZE * sizeof(int));
        matC[i] = (int*)malloc(MATRIX_SIZE * sizeof(int));
    }
    m5_reset_stats(0, 0);
    for(int i = 0; i < MATRIX_SIZE; i++) {
        for(int j = 0; j < MATRIX_SIZE; j++) {
            /* Read prefetches for the two inputs, write prefetch for the
               output. Near the end of a row, j + stride lies past the row's
               allocation; only the address is formed, no element is read. */
            asm volatile ("prefetch.r %[addr]":
                : [addr]"A"(matA[i][j + stride]));
            asm volatile ("prefetch.r %[addr]":
                : [addr]"A"(matB[i][j + stride]));
            asm volatile ("prefetch.w %[addr]":
                : [addr]"A"(matC[i][j + stride]));
            matC[i][j] = matA[i][j] + matB[i][j];
        }
    }
    m5_dump_stats(0, 0);
    ...
}
The test results are shown in the following table:
Settings | Clock Cycles |
---|---|
No Prefetch | 260038 |
Stride 16 / 3 prefetch | 195276 |
Stride 16 / 2 prefetch (no pf.w) | 173609 |
Stride 16/ 1 prefetch | 214953 |
Stride 8/3 prefetch | 195229 |
Stride 8/2 prefetch (no pf.w) | 180388 |
Stride 8/ 1 prefetch | 220065 |
Stride 1/3 prefetch | 389572 |
Stride 1/ 2 Prefetch (no pf.w) | 292908 |
Stride 1/ 1 Prefetch | 389572 |