diff --git a/Images/BranchPredictor.png b/Images/BranchPredictor.png new file mode 100644 index 0000000..03641dc Binary files /dev/null and b/Images/BranchPredictor.png differ diff --git a/TODO.org b/TODO.org index 372eb77..e69de29 100644 --- a/TODO.org +++ b/TODO.org @@ -1,57 +0,0 @@ -* Tasks -** DONE File IO and test -** DONE Stop exploding the heap with logs :DDD -** DONE Fix DONE instruction for VM termination -*** DONE Add setting instructions -** DONE Add assembler -** DONE Chisel tester -** DONE Add LF -** DONE Redo colors in fansi. ANSI fucks up string formatting -** DONE Columnize log events -** DONE Chisel test log evaluator -** DONE Create giftWrapper script -** DONE Better sourceinfo stuff - Good enough - -** DONE Test options -*** DONE How much NOP pad? -*** DONE Verbosity? -*** DONE Which tests? -** DONE ish Step counter, pretty print VM log, including final memory state -** TODO More programs -*** DONE Real programs -*** TODO Basic programs - Needs more -** DONE Merge in LF changes -** TODO Breakpoints -*** TODO VM breakpoints -**** TODO Record breakpoints in chisel tester -*** TODO Chisel breakpoints -**** TODO Freeze processor to record state -**** TODO Record breakpoints in chisel tester -*** TODO Draw breakpoints in the printer -** TODO Calculate steps needed -** TODO Unmangle derailed traces - With incorrect designs the trace printer ends up printing a lot of diveregent - unsychnronizable blocks -** DONE Fix DONE instruction -*** DONE Parse error -*** DONE Use DONE address -** DONE Hazard generator - good enough -** TODO Semantic logging - Currently logging is quite awkward, a combination of fansi and regular strings. - Ideally a markdown format such as HTML should be used. There are already plenty - good scala libraries for this, such as liyaohi's stuff (big shoutout!) - -** TODO Interactive stepping - This one is a pretty big undertaking, but it could be very useful to run the circuit in an interactiv - environment. 
- https://venus.cs61c.org/ is a good example of how useful this can be for a virtual machine. - This task requires pretty good understanding of chisel. -* Maybe -** DONE Move instruction recording to IMEM rather than IF? - Only care about what IF gets, won't have to deal with whatever logic is in IF. -** DONE Figure out why loading instructions backwards made shit werk - Not as funny as you'd think. The issue was overwriting the last written instruction with 0 - diff --git a/branchProfiler.scala b/branchProfiler.scala new file mode 100644 index 0000000..e69de29 diff --git a/src/main/scala/main.scala b/src/main/scala/main.scala new file mode 100644 index 0000000..ff7d900 --- /dev/null +++ b/src/main/scala/main.scala @@ -0,0 +1,7 @@ +package FiveStage + +object main { + def main(args: Array[String]): Unit = { + println("helo") + } +} diff --git a/src/test/resources/tests/programs/constants.s b/src/test/resources/tests/programs/constants.s new file mode 100644 index 0000000..e7eb6c4 --- /dev/null +++ b/src/test/resources/tests/programs/constants.s @@ -0,0 +1,15 @@ +main: + li x0, 0x0 + nop + li x1, 0xABCDEF0 + nop + li x1, 32 + li x1, 0x800 + li x1, 0x7FF + nop + nop + done +#regset t0,10 +#regset t1,23 +#regset t2,43 +#regset t3,-11 diff --git a/src/test/resources/tests/programs/halfwords.s b/src/test/resources/tests/programs/halfwords.s new file mode 100644 index 0000000..c87612d --- /dev/null +++ b/src/test/resources/tests/programs/halfwords.s @@ -0,0 +1,18 @@ +main: + li x1, 0x11223344 + li x2, 0x55667788 + li x10, 0x100 + sw x1, 0(x10) + sw x2, 4(x10) + lw x3, 2(x10) + lh x4, 3(x10) + lb x5, 3(x10) + lhu x6, 3(x10) + lbu x7, 3(x10) + sw x1, 8(x10) + sw x1, 9(x10) + sh x2, 9(x10) + sb x2, 11(x10) + lw x12, 8(x10) + lw x13, 12(x10) + done diff --git a/src/test/scala/Manifest.scala b/src/test/scala/Manifest.scala index 4298317..db63ac2 100644 --- a/src/test/scala/Manifest.scala +++ b/src/test/scala/Manifest.scala @@ -1,4 +1,5 @@ package FiveStage + import 
org.scalatest.{Matchers, FlatSpec} import cats._ import cats.implicits._ @@ -20,7 +21,7 @@ object Manifest { val singleTest = "forward2.s" - val nopPadded = true + val nopPadded = false val singleTestOptions = TestOptions( printIfSuccessful = true, @@ -53,18 +54,15 @@ object Manifest { class ProfileBranching extends FlatSpec with Matchers { it should "profile some branches" in { - TestRunner.profileBranching( - Manifest.singleTestOptions.copy(testName = "branchProfiling.s", maxSteps = 50000) + BranchProfiler.profileBranching( + Manifest.singleTestOptions.copy(testName = "branchProfiling.s", maxSteps = 150000) ) should be(true) } } class ProfileCache extends FlatSpec with Matchers { it should "profile a cache" in { - say("Warning, this test takes forever to run! 2 minutes on my machine at least.") - say("This happens due to the less than optimal way of storing the update log. Sorry I guess") - say("You probably want to debug this with a smaller program") - TestRunner.profileCache( + CacheProfiler.profileCache( Manifest.singleTestOptions.copy(testName = "convolution.s", maxSteps = 150000) ) should be(true) } diff --git a/src/test/scala/RISCV/DataTypes.scala b/src/test/scala/RISCV/DataTypes.scala index 14d6c61..c6c41d3 100644 --- a/src/test/scala/RISCV/DataTypes.scala +++ b/src/test/scala/RISCV/DataTypes.scala @@ -176,8 +176,16 @@ object Data { val bitsRight = 32 - slots.log2 val leftShifted = i << bitsLeft val rightShifted = leftShifted >>> bitsRight - // say(i) - // say(rightShifted) + rightShifted + } + + // To get the entire word call with from = 31, to = 0 + def bits(from: Int, to: Int): Int = { + val bitsLeft = 31 - from + val bitsRight = bitsLeft + to + val leftShifted = i << bitsLeft + val rightShifted = leftShifted >>> bitsRight + rightShifted } } diff --git a/src/test/scala/RISCV/branchProfiler.scala b/src/test/scala/RISCV/branchProfiler.scala new file mode 100644 index 0000000..20d4235 --- /dev/null +++ b/src/test/scala/RISCV/branchProfiler.scala @@ -0,0 
+1,147 @@ +package FiveStage + +import org.scalatest.{Matchers, FlatSpec} +import cats._ +import cats.implicits._ +import fileUtils._ + +import chisel3.iotesters._ +import scala.collection.mutable.LinkedHashMap + +import fansi.Str + +import Ops._ +import Data._ +import VM._ + +import PrintUtils._ +import LogParser._ + +object BranchProfiler { + + def profileBranching(testOptions: TestOptions): Boolean = { + + val testResults = for { + lines <- fileUtils.readTest(testOptions) + program <- FiveStage.Parser.parseProgram(lines, testOptions) + (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run)) + } yield { + + sealed trait BranchEvent + case class Taken(from: Int, to: Int) extends BranchEvent { override def toString = s"Taken ${from.hs}\t${to.hs}" } + case class NotTaken(addr: Int) extends BranchEvent { override def toString = s"Not Taken ${addr.hs}" } + + val events: List[BranchEvent] = trace.flatMap(_.event).collect{ + case PcUpdateBranch(from, to) => Taken(from.value, to.value) + case PcUpdateNoBranch(at) => NotTaken(at.value) + } + + + /** + * This is a sample profiler for a rather unrealistic branch predictor which has an unlimited amount + * of slots + */ + def OneBitInfiniteSlots(events: List[BranchEvent]): Int = { + + // Uncomment to take a look at the event log + // say(events.mkString("\n","\n","\n")) + + // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated + // to reflect this. 
+ // As long as there are remaining events the helper calls itself recursively on the remainder + def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = { + events match { + + // Scala syntax for matching a list with a head element of some type and a tail + // `case h :: t =>` + // means we want to match a list with at least a head and a tail (tail can be Nil, so we + // essentially want to match a list with at least one element) + // h is the first element of the list, t is the remainder (which can be Nil, aka empty) + + // `case Constructor(arg1, arg2) :: t => ` + // means we want to match a list whose first element is of type Constructor, giving us access to its internal + // values. + + // `case Constructor(arg1, arg2) :: t => if(p(arg1, arg2))` + // means we want to match a list whose first element is of type Constructor while satisfying some predicate p, + // called an if guard. + case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable) + case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true)) + case NotTaken(addr) :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false)) + case NotTaken(addr) :: t if(!predictionTable(addr)) => helper(t, predictionTable) + case Nil => 0 + } + } + + // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken + def initState = events.map{ + case Taken(from, addr) => (from, false) + case NotTaken(addr) => (addr, false) + }.toMap + + helper(events, initState) + } + + + def twoBitPredictor(events: List[BranchEvent], slots: Int): Int = { + + case class nBitPredictor( + values : List[Int], + predictionRules : List[Boolean], + transitionRules : Int => Boolean => Int, + ){ + val slots = values.size + + def predict(pc: Int): Boolean = predictionRules(values(pc.getTag(slots))) + + def update(pc: Int, taken: Boolean): nBitPredictor = { + val 
current = values(pc.getTag(slots)) + val next = copy(values = values.updated(pc.getTag(slots), transitionRules(current)(taken))) + next + } + + override def toString = values.map(x => x.binary(2)).mkString("[","][","]") + } + + val initPredictor = nBitPredictor( + List.fill(slots)(0), + List( + false, + false, + true, + true, + ), + r => r match { + case 0 => taken => if(taken) 1 else 0 + case 1 => taken => if(taken) 2 else 0 + case 2 => taken => if(taken) 3 else 1 + case 3 => taken => if(taken) 3 else 2 + } + ) + + events.foldLeft((0, initPredictor)){ case(((acc, bp), event)) => + println() + say(s"total misses: $acc") + say(event) + event match { + case Taken(pc, _) => say(s"taken at tag: ${pc.getTag(slots)}") + case NotTaken(pc) => say(s"not taken at tag: ${pc.getTag(slots)}") + } + say(bp) + event match { + case Taken(pc, _) if bp.predict(pc) => {say("HIT!"); (acc, bp.update(pc, true))} + case Taken(pc, _) => {say("MISS!"); (acc + 1, bp.update(pc, true))} + case NotTaken(pc) if !bp.predict(pc) => {say("HIT!"); (acc, bp.update(pc, false))} + case NotTaken(pc) => {say("MISS!"); (acc + 1, bp.update(pc, false))} + } + }._1 + } + + say(events.mkString("\n","\n","\n")) + say(twoBitPredictor(events, 8)) + } + + + true + } +} diff --git a/src/test/scala/RISCV/cacheProfiler.scala b/src/test/scala/RISCV/cacheProfiler.scala new file mode 100644 index 0000000..1765626 --- /dev/null +++ b/src/test/scala/RISCV/cacheProfiler.scala @@ -0,0 +1,204 @@ +package FiveStage + +import org.scalatest.{Matchers, FlatSpec} +import cats._ +import cats.implicits._ +import fileUtils._ + +import chisel3.iotesters._ +import scala.collection.mutable.LinkedHashMap + +import fansi.Str + +import Ops._ +import Data._ +import VM._ + +import PrintUtils._ +import LogParser._ + +object CacheProfiler { + + def profileCache(testOptions: TestOptions): Boolean = { + + val testResults = for { + lines <- fileUtils.readTest(testOptions) + program <- FiveStage.Parser.parseProgram(lines, testOptions) + 
(binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run)) + } yield { + + import TestUtils._ + + sealed trait MemoryEvent + case class Write(addr: Int) extends MemoryEvent + case class Read(addr: Int) extends MemoryEvent + + val events = trace.flatMap(_.event).collect{ + case MemWrite(addr, _) => Write(addr.value) + case MemRead(addr, _) => Read(addr.value) + } + + + class CacheProfiler(setCount: Int, setSize: Int, blockSize: Int){ + + // If we set counter to 0 we risk evicting the first allocated block. + var counter = 1 + var misses = 0 + var mostRecent = 0 + var wasMiss = false + + implicit class AddrOps(i: Int){ + val blockOffsetBits = blockSize.log2 + val lineBits = setSize.log2 + + def lineIdx: Int = { + i.bits(2 + blockOffsetBits + lineBits - 1, 2 + blockOffsetBits) + } + } + + + case class CacheLine(tag: Int, lastUsed: Int){ + def matches(addr: Int): Boolean = List.fill(blockSize)(tag) + .zipWithIndex + .map{ case(btag, idx) => btag + idx*4 } + .map(_ == addr) + .foldLeft(false)(_ || _) + + def renderContent(addr: Int): String = (addr == mostRecent, wasMiss) match { + case (true, true) => Console.RED + addr.hs + Console.RESET + case (true, false) => Console.GREEN + addr.hs + Console.RESET + case _ => addr.hs + } + + def render: String = { + val blockContents = List.fill(blockSize)(tag) + .zipWithIndex + .map{ case(btag, idx) => renderContent(btag + idx*4) } + .mkString("Contents: || ", " | ", " |") + + s"Base: ${tag.hs} LRU: $lastUsed\t" + blockContents + } + } + object CacheLine { + def truncateTag(addr: Int) = addr - (addr % (blockSize*4)) + } + + + case class CacheSet(blocks: Array[CacheLine]){ + def lineIdx(addr: Int): Int = addr.lineIdx + def contains(addr: Int): Boolean = blocks.map(_.matches(addr)).foldLeft(false)(_ || _) + + def updateLRU(addr: Int): Unit = { + val idx = lineIdx(addr) + val next = blocks(idx).copy(lastUsed = counter) + blocks(idx) = next + } + + def render: String = { + 
blocks.map(_.render).mkString("\n", "\n", "\n") + } + } + + + case class Cache(sets: Array[CacheSet]){ + + /** returns the index of set if hit */ + def checkHit(addr: Int): Option[Int] = sets + .zipWithIndex + .map{ case(set, idx) => Option.when(set.contains(addr))(idx) } + .flatten.headOption + + + /** Updates the LRU counter */ + def updateLRU(addr: Int, setIdx: Int): Unit = sets(setIdx).updateLRU(addr) + + + /** Gets set with least recently used */ + def getLRU(addr: Int): Int = sets + .map( set => set.blocks(set.lineIdx(addr)).lastUsed) + .zipWithIndex + .sortBy(_._1) + .map(_._2) + .head + + + /** Entry point */ + def handleAccess(addr: Int): Unit = { + mostRecent = addr + counter += 1 + + checkHit(addr) match { + + case Some(setIdx) => { + wasMiss = false + updateLRU(addr, setIdx) + // say(s"${addr.hs} HIT") + } + + case None => { + val set = sets(getLRU(addr)) + val nextTag = CacheLine.truncateTag(addr) + set.blocks(set.lineIdx(addr)) = set.blocks(set.lineIdx(addr)).copy( + tag = nextTag, + lastUsed = counter + ) + misses += 1 + + wasMiss = true + // say(s"${addr.hs} MISS") + // say(s"BLOCK ${addr.lineIdx} IN SET ${getLRU(addr)} EVICTED. BYE BYE") + } + } + } + + /** Pretty pictures! 
*/ + def render: String = { + sets.map(_.render).mkString("\n", "\n", "\n") + } + } + + object Cache { + def init: Cache = Cache(Array.fill(setCount)( + CacheSet(Array.fill(setSize)(CacheLine(57005, 0)))) + ) + } + } + + for{ + sets <- List(2, 4, 8) + blockSize <- List(4, 8) + lines <- List(2, 4, 8) + } yield { + + val myTest = new CacheProfiler(sets, lines, blockSize) + val myCache = myTest.Cache.init + events.foreach{ + case Write(addr) => myCache.handleAccess(addr) + case Read(addr) => myCache.handleAccess(addr) + } + + say(s"sets: $sets, lines: $lines, blockSize: $blockSize yields ${myTest.misses} misses") + } + + // val myTest = new CacheProfiler(2, 4, 4) + // val myCache = myTest.Cache.init + // events.foreach{ + // case Write(addr) => { + // say(addr.hs) + // myCache.handleAccess(addr) + // say(myCache.render) + // } + // case Read(addr) => { + // say(addr.hs) + // myCache.handleAccess(addr) + // say(myCache.render) + // } + // } + + // say(myTest.misses) + + } + + true + } +} diff --git a/src/test/scala/RISCV/testRunner.scala b/src/test/scala/RISCV/testRunner.scala index 3d9d60c..f59dcf6 100644 --- a/src/test/scala/RISCV/testRunner.scala +++ b/src/test/scala/RISCV/testRunner.scala @@ -101,98 +101,4 @@ object TestRunner { successful }.toOption.getOrElse(false) } - - def profileBranching(testOptions: TestOptions): Boolean = { - - val testResults = for { - lines <- fileUtils.readTest(testOptions) - program <- FiveStage.Parser.parseProgram(lines, testOptions) - (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run)) - } yield { - - sealed trait BranchEvent - case class Taken(from: Int, to: Int) extends BranchEvent { override def toString = s"Taken ${from.hs}\t${to.hs}" } - case class NotTaken(addr: Int) extends BranchEvent { override def toString = s"Not Taken ${addr.hs}" } - - val events: List[BranchEvent] = trace.flatMap(_.event).collect{ - case PcUpdateBranch(from, to) => Taken(from.value, to.value) - case 
PcUpdateNoBranch(at) => NotTaken(at.value) - } - - - /** - * This is a sample profiler for a rather unrealistic branch predictor which has an unlimited amount - * of slots - */ - def OneBitInfiniteSlots(events: List[BranchEvent]): Int = { - - // Uncomment to take a look at the event log - // say(events.mkString("\n","\n","\n")) - - // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated - // to reflect this. - // As long as there are remaining events the helper calls itself recursively on the remainder - def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = { - events match { - - // Scala syntax for matching a list with a head element of some type and a tail - // `case h :: t =>` - // means we want to match a list with at least a head and a tail (tail can be Nil, so we - // essentially want to match a list with at least one element) - // h is the first element of the list, t is the remainder (which can be Nil, aka empty) - - // `case Constructor(arg1, arg2) :: t => ` - // means we want to match a list whose first element is of type Constructor, giving us access to its internal - // values. - - // `case Constructor(arg1, arg2) :: t => if(p(arg1, arg2))` - // means we want to match a list whose first element is of type Constructor while satisfying some predicate p, - // called an if guard. 
- case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable) - case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true)) - case NotTaken(addr) :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false)) - case NotTaken(addr) :: t if(!predictionTable(addr)) => helper(t, predictionTable) - case Nil => 0 - } - } - - // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken - def initState = events.map{ - case Taken(from, addr) => (from, false) - case NotTaken(addr) => (addr, false) - }.toMap - - helper(events, initState) - } - - say(OneBitInfiniteSlots(events)) - } - - - true - } - - - def profileCache(testOptions: TestOptions): Boolean = { - - val testResults = for { - lines <- fileUtils.readTest(testOptions) - program <- FiveStage.Parser.parseProgram(lines, testOptions) - (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run)) - } yield { - - sealed trait MemoryEvent - case class Write(addr: Int) extends MemoryEvent - case class Read(addr: Int) extends MemoryEvent - - val events: List[MemoryEvent] = trace.flatMap(_.event).collect{ - case MemWrite(x,_) => Write(x.value) - case MemRead(x,_) => Read(x.value) - } - - // Your cache here - - } - true - } } diff --git a/src/test/scala/TestUtils.scala b/src/test/scala/TestUtils.scala index acaa741..c1a0f94 100644 --- a/src/test/scala/TestUtils.scala +++ b/src/test/scala/TestUtils.scala @@ -6,6 +6,10 @@ import PrintUtils._ object TestUtils { + implicit class OptionBackport(t: Option.type){ + def when[T](b: Boolean)(t: => T) = if(b) Some(t) else None + } + /** * Generate and serialize BTrees for the test runner */ diff --git a/src/test/scala/chiselTestRunner.scala b/src/test/scala/chiselTestRunner.scala index cdd83a8..b244600 100644 --- a/src/test/scala/chiselTestRunner.scala +++ b/src/test/scala/chiselTestRunner.scala @@ 
-142,7 +142,7 @@ private class ChiselTestRunner ( // After finishing, let the circuit run until all updates can be committed. private def flush: List[CircuitTrace] = - (0 to 3).map(_ => stepOne).reverse.toList + (0 to 4).map(_ => stepOne).reverse.toList /** * Run the entire shebang diff --git a/theory2.org b/theory2.org index f98e836..a8b4848 100644 --- a/theory2.org +++ b/theory2.org @@ -1,36 +1,55 @@ -* Question 1 - Hazards - For the following programs describe each hazard with type (data or control), line number and a - small (max one sentence) description +* Question 0 - Testing hazards + This question is mandatory, but rewards no points (not directly at least). -** program 1 - #+begin_src asm - addi t0, zero, 10 - addi t1, zero, 20 - L2: - sub t1, t1, t0 - beq t1, zero, .L2 - jr ra - #+end_src + The tests found in the testing framework are useful for testing a fully working processor, however it + leaves much to be desired for when you actually want to design one from whole cloth. + + To rectify this, you should write some tests of your own that should serve as a minimal case for various + hazards that you will encounter. You do not need to deliver anything here, but I expect you to have + these tests if you ask me for help debugging your design during lab hours. + (You can of course come to lab hours if you're having trouble writing these tests) -** program 2 - #+begin_src asm - addi t0, zero, 10 - lw t0, 10(t0) - beq t0, zero, .L3 - jr ra - #+end_src +** Forwarding + The tests in forward1.s and forward2.s are automatically generated, long, and non-specific, + thus not very suited for debugging. + + You should write one (or more) test(s) that systematically expose your processor to dependency + hazards, including instructions that: + + Needs forwarding from MEM and WB (i.e dependencies with NOPs between them). + + Exposes results that should *not* be forwarded due to regWrite being false. + + Writes and reads to/from the zero register. 
-** program 3 - #+begin_src asm - lw t0, 0(t0) - lw t1, 4(t0) - sw t0, 8(t1) - lw t1, 12(t0) - beq t0, t1, .L3 - jr ra - #+end_src +** Load freezes + Load freezes are tricky since they have an interaction with the forwarding unit, often causing + bugs that appear with low frequency in the supplied test programs. + + You should write tests (I suggest one test per case) that systematically expose your processor to + dependency hazards where one or more of the dependencies are memory accesses, including instructions that: + + Needs forwarding from MEM and WB where MEM, WB or both are load instructions. + + Exposes false dependencies from MEM and WB where one or more are loads. + For instance, consider ~addi x1, x1, 10~ in machine code with the rs2 field highlighted: + 0x00a08093 = 0b0000000 | 01010 | 00001000000010010011 + In this case there is a false dependency on x10 since x10 is only an artefact of the immediate + value which could cause an unnecessary freeze. + + Writes and reads to/from the zero register, which could trigger an unnecessary freeze + + Instructions that cause multiple freezes in a row. + + Instructions that cause multiple freezes in a row followed by an instruction with multiple + dependencies. + + +** Control hazards + There are a lot of possible interactions when jumping and branching, you need to write tests + that ensure that instructions are properly bubbled if they shouldn't have been fetched. + You should also test for interactions between forwarding and freezing here, i.e. what happens + when the address calculation relies on forwarded values? What happens if the forwarded value + comes from a load instruction necessitating a freeze? + + +* TODO Question 1 - Hazards + Write programs here that are less of a crapshoot. Clarify dependency vs hazards etc etc and + *enforce* a format that is easy to grade. 
* Question 2 - Handling hazards @@ -39,7 +58,7 @@ ** Data hazards 1 At some cycle the following instructions can be found in a 5 stage design: - + #+begin_src text EX: || MEM: || WB: ---------------------||-------------------------||-------------------------- @@ -52,13 +71,17 @@ branch = false || branch = true || branch = false jump = false || jump = false || jump = false #+end_src - + For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get data from for rs1 and rs2? - + Answer should be on the form: + + rs1: Narnia + rs2: Wikipedia + ** Data hazards 2 At some cycle the following instructions can be found in a 5 stage design: - + #+begin_src text EX: || MEM: || WB: ---------------------||-------------------------||-------------------------- @@ -73,11 +96,15 @@ #+end_src For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get data from for rs1 and rs2? + Answer should be on the form: + + rs1: Random noise + rs2: WB (MEM if it's a tuesday) ** Data hazards 3 At some cycle the following instructions can be found in a 5 stage design: - + #+begin_src text EX: || MEM: || WB: ---------------------||-------------------------||-------------------------- @@ -89,24 +116,26 @@ memWrite = true || memWrite = false || memWrite = false branch = false || branch = false || branch = false jump = false || jump = false || jump = false - - Should the forwarding unit issue a load hazard signal? - (Hint: what are the semantics of the instruction currently in EX stage?) #+end_src + Should the forwarding unit issue a load hazard signal? *This is a yes/no question* + (Hint: what are the semantics of the instruction currently in EX stage?) 
+ * Question 3 - Branch prediction - Consider a 2 bit branch predictor with only 4 slots for a 32 bit architecture (without BTB), where the decision to + Consider a 2 bit branch predictor with only 4 slots for a 32 bit architecture (without BTB), where the decision to take a branch or not is decided in accordance to the following table: #+begin_src text state || predict taken || next state if taken || next state if not taken || =======||=================||=======================||==========================|| 00 || NO || 01 || 00 || - 01 || NO || 10 || 00 || - 10 || YES || 11 || 01 || + 01 || NO || 11 || 00 || + 10 || YES || 11 || 00 || 11 || YES || 11 || 10 || #+end_src - - (This is known as a saturating 2bit counter, it is *not* the same scheme as in the lecture slides) + + Which corresponds to this figure: + #+CAPTION: FSM of a 2 bit branch predictor. Note that it is not a 2bit saturating counter. + [[./Images/BranchPredictor.png]] At some point during execution the program counter is ~0xc~ and the branch predictor table looks like this: #+begin_src text @@ -114,21 +143,34 @@ ======||======== 00 || 01 01 || 00 - 10 || 11 - 11 || 01 + 10 || 01 + 11 || 10 #+end_src For the following program: #+begin_src asm - 0xc addi x1, x3, 10 - 0x10 add x2, x1, x1 - 0x14 beq x1, x2, .L1 + .L1: + 0x0C addi x1, x1, 1 + 0x10 add x2, x2, x1 + 0x14 bge x2, x3, .L1 0x18 j .L2 + .L3: + 0x1C addi x2, x2, 0x10 + 0x20 slli x2, x2, 0x4 + 0x24 jr ra #+end_src - - Will the predictor predict taken or not taken for the beq instruction? -* Question 4 - Benchmarking + At cycle 0 the state of the machine is as follows: + #+begin_src text + PC = 0x0C + x1 = 0x0 + x2 = 0x0 + x3 = 0x7 + #+end_src + + At which cycle will the PC be 0x24 given a 2 cycle delay for mispredicts? + +* Question 4 - Benchmarking a branch profiler In order to gauge the performance increase from adding branch predictors it is necessary to do some testing. 
Rather than writing a test from scratch it is better to use the tester already in use in the test harness. When running a program the VM outputs a log of all events, including which branches have been taken and which @@ -148,7 +190,7 @@ To help you get started, I have provided you with much of the necessary code. In order to get an idea for how you should profile branch misses, consider the following profiler which calculates - misses for a processor with a branch predictor with a 1 bit predictor with infinite memory: + misses for a processor with a branch predictor with a 1 bit predictor with infinite slots: #+BEGIN_SRC scala def OneBitInfiniteSlots(events: List[BranchEvent]): Int = { @@ -172,11 +214,11 @@ // `case Constructor(arg1, arg2) :: t => if(p(arg1, arg2))` // means we want to match a list whose first element is of type Constructor while satisfying some predicate p, // called an if guard. - case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable) - case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true)) - case NotTaken(addr) :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false)) - case NotTaken(addr) :: t if(!predictionTable(addr)) => helper(t, predictionTable) - case _ => 0 + case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable) + case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true)) + case NotTaken(addr) :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false)) + case NotTaken(addr) :: t if(!predictionTable(addr)) => helper(t, predictionTable) + case _ => 0 } } @@ -207,7 +249,7 @@ With a 2 bit 8 slot scheme, how many mispredicts will happen? Answer with a number. - + Hint: Use the getTag method defined on int (in DataTypes.scala) to get the tag for an address. 
#+BEGIN_SRC scala val slots = 8 @@ -221,7 +263,7 @@ say(0x1C5C.getTag(slots)) // prints 7 say(0x1C60.getTag(slots)) // prints 0 (thus conflicts with 0x1C40) #+END_SRC - + * Question 5 - Cache profiling Unlike our design which has a very limited memory pool, real designs have access to vast amounts of memory, offset @@ -231,11 +273,6 @@ In order to investigate how caches can alter performance it is therefore necessary to make some rather unrealistic assumptions to see how different cache schemes impacts performance. - We will therefore assume the following: - + Reads from main memory takes 5 cycles - + cache has a total storage of 8 words (256 bits) - + cache reads work as they do now (i.e no additional latency) - For this exercise you will write a program that parses a log of memory events, similar to previous task #+BEGIN_SRC scala sealed trait MemoryEvent @@ -246,32 +283,13 @@ def profile(events: List[MemoryEvent]): Int = ??? #+END_SRC -** Your task - Your job is to implement a model that tests how many delay cycles will occur for a cache which: - + Follows a 2-way associative scheme - + set size is 4 words (128 bits) (total cache size: a whopping 256 bits) - + Block size is 1 word (32 bits) meaning that we *do not need a block offset*. - + Is write-through write no-allocate (this means that you can ignore stores, only loads will affect the cache) +** TODO Your task + Your job is to implement a *parameterised* model that tests how many delay cycles will occur for a cache with + the following configuration: + + Follows an n-way associative scheme (parameter) + + Is write-through write allocate. + Eviction policy is LRU (least recently used) - - In the typical cache each block has more than 32 bits, requiring an offset, however the - simulated cache does not. - This means that the simulated cache has two sets of 4 words, greatly reducing the complexity - of your implementation. - - Additionally, assume that writes does not change the the LRU counter. 
- This means that that your cache will only consider which value was most recently loaded, - not written. - It's not realistic, but it allows you to completely disregard write events (you can - just filter them out if you want.) - Your answer should be the number of cache miss latency cycles when using this cache. - -*** Further study - If you have the time I strongly encourage you to experiment with a larger cache with bigger - block sizes, forcing you to implement the additional complexity of block offsets. - Likewise, by trying a different scheme than write-through no-allocate you will get a much - better grasp on how exactly the cache works. - This is *not* a deliverable, just something I encourage you to tinker with to get a better - understanding. + To make this task easier a data structure with stub methods has been implemented for you. + Answer by pasting the output from running the cacheProfiler test.