Stuff I forgot to commit.

peteraa 2020-06-29 16:17:24 +02:00
parent b8ae0092c1
commit 9f47433501
14 changed files with 517 additions and 249 deletions

Images/BranchPredictor.png (new binary file, 43 KiB)

@ -1,57 +0,0 @@
* Tasks
** DONE File IO and test
** DONE Stop exploding the heap with logs :DDD
** DONE Fix DONE instruction for VM termination
*** DONE Add setting instructions
** DONE Add assembler
** DONE Chisel tester
** DONE Add LF
** DONE Redo colors in fansi. ANSI fucks up string formatting
** DONE Columnize log events
** DONE Chisel test log evaluator
** DONE Create giftWrapper script
** DONE Better sourceinfo stuff
Good enough
** DONE Test options
*** DONE How much NOP pad?
*** DONE Verbosity?
*** DONE Which tests?
** DONE ish Step counter, pretty print VM log, including final memory state
** TODO More programs
*** DONE Real programs
*** TODO Basic programs
Needs more
** DONE Merge in LF changes
** TODO Breakpoints
*** TODO VM breakpoints
**** TODO Record breakpoints in chisel tester
*** TODO Chisel breakpoints
**** TODO Freeze processor to record state
**** TODO Record breakpoints in chisel tester
*** TODO Draw breakpoints in the printer
** TODO Calculate steps needed
** TODO Unmangle derailed traces
With incorrect designs the trace printer ends up printing a lot of divergent,
unsynchronizable blocks
** DONE Fix DONE instruction
*** DONE Parse error
*** DONE Use DONE address
** DONE Hazard generator
good enough
** TODO Semantic logging
Currently logging is quite awkward, a combination of fansi and regular strings.
Ideally a markup format such as HTML should be used. There are already plenty of
good Scala libraries for this, such as lihaoyi's stuff (big shoutout!)
** TODO Interactive stepping
This one is a pretty big undertaking, but it could be very useful to run the circuit in an interactive
environment.
https://venus.cs61c.org/ is a good example of how useful this can be for a virtual machine.
This task requires pretty good understanding of chisel.
* Maybe
** DONE Move instruction recording to IMEM rather than IF?
Only care about what IF gets, won't have to deal with whatever logic is in IF.
** DONE Figure out why loading instructions backwards made shit werk
Not as funny as you'd think. The issue was overwriting the last written instruction with 0

branchProfiler.scala Normal file

@ -0,0 +1,7 @@
package FiveStage
object main {
  def main(args: Array[String]): Unit = {
    println("helo")
  }
}


@ -0,0 +1,15 @@
main:
li x0, 0x0
nop
li x1, 0xABCDEF0
nop
li x1, 32
li x1, 0x800
li x1, 0x7FF
nop
nop
done
#regset t0,10
#regset t1,23
#regset t2,43
#regset t3,-11


@ -0,0 +1,18 @@
main:
li x1, 0x11223344
li x2, 0x55667788
li x10, 0x100
sw x1, 0(x10)
sw x2, 4(x10)
lw x3, 2(x10)
lh x4, 3(x10)
lb x5, 3(x10)
lhu x6, 3(x10)
lbu x7, 3(x10)
sw x1, 8(x10)
sw x1, 9(x10)
sh x2, 9(x10)
sb x2, 11(x10)
lw x12, 8(x10)
lw x13, 12(x10)
done


@ -1,4 +1,5 @@
package FiveStage
import org.scalatest.{Matchers, FlatSpec}
import cats._
import cats.implicits._
@ -20,7 +21,7 @@ object Manifest {
val singleTest = "forward2.s"
-val nopPadded = true
+val nopPadded = false
val singleTestOptions = TestOptions(
  printIfSuccessful = true,
@ -53,18 +54,15 @@ object Manifest {
class ProfileBranching extends FlatSpec with Matchers {
  it should "profile some branches" in {
-   TestRunner.profileBranching(
-     Manifest.singleTestOptions.copy(testName = "branchProfiling.s", maxSteps = 50000)
+   BranchProfiler.profileBranching(
+     Manifest.singleTestOptions.copy(testName = "branchProfiling.s", maxSteps = 150000)
    ) should be(true)
  }
}
class ProfileCache extends FlatSpec with Matchers {
  it should "profile a cache" in {
-   say("Warning, this test takes forever to run! 2 minutes on my machine at least.")
-   say("This happens due to the less than optimal way of storing the update log. Sorry I guess")
-   say("You probably want to debug this with a smaller program")
-   TestRunner.profileCache(
+   CacheProfiler.profileCache(
      Manifest.singleTestOptions.copy(testName = "convolution.s", maxSteps = 150000)
    ) should be(true)
  }


@ -176,8 +176,16 @@ object Data {
val bitsRight = 32 - slots.log2
val leftShifted = i << bitsLeft
val rightShifted = leftShifted >>> bitsRight
-// say(i)
-// say(rightShifted)
rightShifted
}
+// To get the entire word call with from = 31, to = 0
+def bits(from: Int, to: Int): Int = {
+  val bitsLeft = 31 - from
+  val bitsRight = bitsLeft + to
+  val leftShifted = i << bitsLeft
+  val rightShifted = leftShifted >>> bitsRight
+  rightShifted
+}
}


@ -0,0 +1,147 @@
package FiveStage
import org.scalatest.{Matchers, FlatSpec}
import cats._
import cats.implicits._
import fileUtils._
import chisel3.iotesters._
import scala.collection.mutable.LinkedHashMap
import fansi.Str
import Ops._
import Data._
import VM._
import PrintUtils._
import LogParser._

object BranchProfiler {
  def profileBranching(testOptions: TestOptions): Boolean = {
    val testResults = for {
      lines   <- fileUtils.readTest(testOptions)
      program <- FiveStage.Parser.parseProgram(lines, testOptions)
      (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
    } yield {

      sealed trait BranchEvent
      case class Taken(from: Int, to: Int) extends BranchEvent { override def toString = s"Taken ${from.hs}\t${to.hs}" }
      case class NotTaken(addr: Int)       extends BranchEvent { override def toString = s"Not Taken ${addr.hs}" }

      val events: List[BranchEvent] = trace.flatMap(_.event).collect{
        case PcUpdateBranch(from, to) => Taken(from.value, to.value)
        case PcUpdateNoBranch(at)     => NotTaken(at.value)
      }

      /**
        * This is a sample profiler for a rather unrealistic branch predictor which has an unlimited amount
        * of slots
        */
      def OneBitInfiniteSlots(events: List[BranchEvent]): Int = {

        // Uncomment to take a look at the event log
        // say(events.mkString("\n","\n","\n"))

        // Helper inspects the next element of the event list. If the event is a mispredict the prediction
        // table is updated to reflect this.
        // As long as there are remaining events the helper calls itself recursively on the remainder
        def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = {
          events match {

            // Scala syntax for matching a list with a head element of some type and a tail
            // `case h :: t =>`
            // means we want to match a list with at least a head and a tail (tail can be Nil, so we
            // essentially want to match a list with at least one element)
            // h is the first element of the list, t is the remainder (which can be Nil, aka empty)

            // `case Constructor(arg1, arg2) :: t =>`
            // means we want to match a list whose first element is of type Constructor, giving us access
            // to its internal values.

            // `case Constructor(arg1, arg2) :: t if(p(arg1, arg2)) =>`
            // means we want to match a list whose first element is of type Constructor while satisfying
            // some predicate p, called an if guard.
            case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable)
            case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true))
            case NotTaken(addr)  :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false))
            case NotTaken(addr)  :: t if(!predictionTable(addr)) => helper(t, predictionTable)
            case Nil => 0
          }
        }

        // Initially every possible branch is set to false since the initial state of the predictor is to
        // assume branch not taken
        def initState = events.map{
          case Taken(from, addr) => (from, false)
          case NotTaken(addr)    => (addr, false)
        }.toMap

        helper(events, initState)
      }

      def twoBitPredictor(events: List[BranchEvent], slots: Int): Int = {

        case class nBitPredictor(
          values          : List[Int],
          predictionRules : List[Boolean],
          transitionRules : Int => Boolean => Int){

          val slots = values.size

          def predict(pc: Int): Boolean = predictionRules(values(pc.getTag(slots)))
          def update(pc: Int, taken: Boolean): nBitPredictor = {
            val current = values(pc.getTag(slots))
            val next = copy(values = values.updated(pc.getTag(slots), transitionRules(current)(taken)))
            next
          }

          override def toString = values.map(x => x.binary(2)).mkString("[","][","]")
        }

        val initPredictor = nBitPredictor(
          List.fill(slots)(0),
          List(false, false, true, true),
          r => r match {
            case 0 => taken => if(taken) 1 else 0
            case 1 => taken => if(taken) 2 else 0
            case 2 => taken => if(taken) 3 else 1
            case 3 => taken => if(taken) 3 else 2
          }
        )

        events.foldLeft((0, initPredictor)){ case((acc, bp), event) =>
          println()
          say(s"total misses: $acc")
          say(event)
          event match {
            case Taken(pc, _) => say(s"taken at tag: ${pc.getTag(slots)}")
            case NotTaken(pc) => say(s"not taken at tag: ${pc.getTag(slots)}")
          }
          say(bp)
          event match {
            case Taken(pc, _) if bp.predict(pc)  => { say("HIT!");  (acc,     bp.update(pc, true))  }
            case Taken(pc, _)                    => { say("MISS!"); (acc + 1, bp.update(pc, true))  }
            case NotTaken(pc) if !bp.predict(pc) => { say("HIT!");  (acc,     bp.update(pc, false)) }
            case NotTaken(pc)                    => { say("MISS!"); (acc + 1, bp.update(pc, false)) }
          }
        }._1
      }

      say(events.mkString("\n","\n","\n"))
      say(twoBitPredictor(events, 8))
    }
    true
  }
}
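For reference, the one-bit scheme above can be exercised in isolation. The following is a minimal standalone sketch, not code from the repo: the event shapes and the table update mirror the profiler above, while the addresses and the toy trace are invented for illustration.

```scala
// Standalone sketch of the one-bit predictor miss count on a toy event trace.
object OneBitExample {
  sealed trait BranchEvent
  case class Taken(from: Int, to: Int) extends BranchEvent
  case class NotTaken(addr: Int) extends BranchEvent

  // Same recursion as `helper` above: a mispredict costs 1 and flips the entry.
  // The table must be pre-seeded with every branch address in the trace.
  def misses(events: List[BranchEvent], table: Map[Int, Boolean]): Int = events match {
    case Taken(from, _) :: t if table(from) => misses(t, table)
    case Taken(from, _) :: t                => 1 + misses(t, table.updated(from, true))
    case NotTaken(a) :: t if table(a)       => 1 + misses(t, table.updated(a, false))
    case NotTaken(a) :: t                   => misses(t, table)
    case Nil                                => 0
  }

  def main(args: Array[String]): Unit = {
    val events = List(Taken(0x14, 0xc), Taken(0x14, 0xc), NotTaken(0x14))
    // First Taken misses (the table starts at not-taken), the second hits, and
    // the final NotTaken misses because the entry now predicts taken: 2 misses.
    println(misses(events, Map(0x14 -> false)))
  }
}
```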


@ -0,0 +1,204 @@
package FiveStage
import org.scalatest.{Matchers, FlatSpec}
import cats._
import cats.implicits._
import fileUtils._
import chisel3.iotesters._
import scala.collection.mutable.LinkedHashMap
import fansi.Str
import Ops._
import Data._
import VM._
import PrintUtils._
import LogParser._

object CacheProfiler {
  def profileCache(testOptions: TestOptions): Boolean = {
    val testResults = for {
      lines   <- fileUtils.readTest(testOptions)
      program <- FiveStage.Parser.parseProgram(lines, testOptions)
      (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
    } yield {
      import TestUtils._

      sealed trait MemoryEvent
      case class Write(addr: Int) extends MemoryEvent
      case class Read(addr: Int)  extends MemoryEvent

      val events = trace.flatMap(_.event).collect{
        case MemWrite(addr, _) => Write(addr.value)
        case MemRead(addr, _)  => Read(addr.value)
      }

      class CacheProfiler(setCount: Int, setSize: Int, blockSize: Int){

        // If we set counter to 0 we risk evicting the first allocated block.
        var counter    = 1
        var misses     = 0
        var mostRecent = 0
        var wasMiss    = false

        implicit class AddrOps(i: Int){
          val blockOffsetBits = blockSize.log2
          val lineBits        = setSize.log2
          def lineIdx: Int = {
            i.bits(2 + blockOffsetBits + lineBits - 1, 2 + blockOffsetBits)
          }
        }

        case class CacheLine(tag: Int, lastUsed: Int){

          def matches(addr: Int): Boolean = List.fill(blockSize)(tag)
            .zipWithIndex
            .map{ case(btag, idx) => btag + idx*4 }
            .map(_ == addr)
            .foldLeft(false)(_ || _)

          def renderContent(addr: Int): String = (addr == mostRecent, wasMiss) match {
            case (true, true)  => Console.RED   + addr.hs + Console.RESET
            case (true, false) => Console.GREEN + addr.hs + Console.RESET
            case _             => addr.hs
          }

          def render: String = {
            val blockContents = List.fill(blockSize)(tag)
              .zipWithIndex
              .map{ case(btag, idx) => renderContent(btag + idx*4) }
              .mkString("Contents: || ", " | ", " |")

            s"Base: ${tag.hs} LRU: $lastUsed\t" + blockContents
          }
        }
        object CacheLine {
          def truncateTag(addr: Int) = addr - (addr % (blockSize*4))
        }

        case class CacheSet(blocks: Array[CacheLine]){
          def lineIdx(addr: Int): Int = addr.lineIdx
          def contains(addr: Int): Boolean = blocks.map(_.matches(addr)).foldLeft(false)(_ || _)
          def updateLRU(addr: Int): Unit = {
            val idx  = lineIdx(addr)
            val next = blocks(idx).copy(lastUsed = counter)
            blocks(idx) = next
          }
          def render: String = {
            blocks.map(_.render).mkString("\n", "\n", "\n")
          }
        }

        case class Cache(sets: Array[CacheSet]){

          /** returns the index of set if hit */
          def checkHit(addr: Int): Option[Int] = sets
            .zipWithIndex
            .map{ case(set, idx) => Option.when(set.contains(addr))(idx) }
            .flatten.headOption

          /** Updates the LRU counter */
          def updateLRU(addr: Int, setIdx: Int): Unit = sets(setIdx).updateLRU(addr)

          /** Gets set with least recently used */
          def getLRU(addr: Int): Int = sets
            .map( set => set.blocks(set.lineIdx(addr)).lastUsed)
            .zipWithIndex
            .sortBy(_._1)
            .map(_._2)
            .head

          /** Entry point */
          def handleAccess(addr: Int): Unit = {
            mostRecent = addr
            counter += 1
            checkHit(addr) match {
              case Some(setIdx) => {
                wasMiss = false
                updateLRU(addr, setIdx)
                // say(s"${addr.hs} HIT")
              }
              case None => {
                val set     = sets(getLRU(addr))
                val nextTag = CacheLine.truncateTag(addr)
                set.blocks(set.lineIdx(addr)) = set.blocks(set.lineIdx(addr)).copy(
                  tag      = nextTag,
                  lastUsed = counter
                )
                misses += 1
                wasMiss = true
                // say(s"${addr.hs} MISS")
                // say(s"BLOCK ${addr.lineIdx} IN SET ${getLRU(addr)} EVICTED. BYE BYE")
              }
            }
          }

          /** Pretty pictures! */
          def render: String = {
            sets.map(_.render).mkString("\n", "\n", "\n")
          }
        }
        object Cache {
          def init: Cache = Cache(Array.fill(setCount)(
            CacheSet(Array.fill(setSize)(CacheLine(57005, 0))))
          )
        }
      }

      for{
        sets      <- List(2, 4, 8)
        blockSize <- List(4, 8)
        lines     <- List(2, 4, 8)
      } yield {
        val myTest  = new CacheProfiler(sets, lines, blockSize)
        val myCache = myTest.Cache.init
        events.foreach{
          case Write(addr) => myCache.handleAccess(addr)
          case Read(addr)  => myCache.handleAccess(addr)
        }
        say(s"sets: $sets, lines: $lines, blockSize: $blockSize yields ${myTest.misses} misses")
      }

      // val myTest  = new CacheProfiler(2, 4, 4)
      // val myCache = myTest.Cache.init
      // events.foreach{
      //   case Write(addr) => {
      //     say(addr.hs)
      //     myCache.handleAccess(addr)
      //     say(myCache.render)
      //   }
      //   case Read(addr) => {
      //     say(addr.hs)
      //     myCache.handleAccess(addr)
      //     say(myCache.render)
      //   }
      // }
      // say(myTest.misses)
    }
    true
  }
}
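The set/line/LRU bookkeeping above is interleaved with rendering code, so here is a stripped-down standalone sketch of the same LRU idea. The parameters and the access list are invented, and the word/set/tag split is a simplification with 1-word blocks, not the profiler's exact indexing.

```scala
// Minimal n-way set-associative LRU miss counter with 1-word blocks.
object LruSketch {
  case class Line(tag: Int, lastUsed: Int)

  def misses(accesses: List[Int], ways: Int, sets: Int): Int = {
    val cache = Array.fill(sets, ways)(Line(-1, 0)) // tag -1 means empty
    var counter = 0
    var missCount = 0
    accesses.foreach { addr =>
      counter += 1
      val word = addr / 4     // drop the byte offset (word-aligned accesses)
      val set  = word % sets
      val tag  = word / sets
      val hit  = cache(set).indexWhere(_.tag == tag)
      if (hit >= 0) cache(set)(hit) = Line(tag, counter) // refresh the LRU stamp
      else {
        // evict the least recently used way in this set
        val victim = cache(set).zipWithIndex.minBy(_._1.lastUsed)._2
        cache(set)(victim) = Line(tag, counter)
        missCount += 1
      }
    }
    missCount
  }

  def main(args: Array[String]): Unit = {
    // With 2 sets, 0x0, 0x20 and 0x40 all land in set 0, so the 2 ways thrash:
    // miss, miss, hit, miss (evicting 0x20), miss.
    println(misses(List(0x0, 0x20, 0x0, 0x40, 0x20), ways = 2, sets = 2))
  }
}
```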


@ -101,98 +101,4 @@ object TestRunner {
    successful
  }.toOption.getOrElse(false)
}
def profileBranching(testOptions: TestOptions): Boolean = {
  val testResults = for {
    lines   <- fileUtils.readTest(testOptions)
    program <- FiveStage.Parser.parseProgram(lines, testOptions)
    (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
  } yield {
    sealed trait BranchEvent
    case class Taken(from: Int, to: Int) extends BranchEvent { override def toString = s"Taken ${from.hs}\t${to.hs}" }
    case class NotTaken(addr: Int)       extends BranchEvent { override def toString = s"Not Taken ${addr.hs}" }

    val events: List[BranchEvent] = trace.flatMap(_.event).collect{
      case PcUpdateBranch(from, to) => Taken(from.value, to.value)
      case PcUpdateNoBranch(at)     => NotTaken(at.value)
    }

    /**
      * This is a sample profiler for a rather unrealistic branch predictor which has an unlimited amount
      * of slots
      */
    def OneBitInfiniteSlots(events: List[BranchEvent]): Int = {

      // Uncomment to take a look at the event log
      // say(events.mkString("\n","\n","\n"))

      // Helper inspects the next element of the event list. If the event is a mispredict the prediction
      // table is updated to reflect this.
      // As long as there are remaining events the helper calls itself recursively on the remainder
      def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = {
        events match {

          // Scala syntax for matching a list with a head element of some type and a tail
          // `case h :: t =>`
          // means we want to match a list with at least a head and a tail (tail can be Nil, so we
          // essentially want to match a list with at least one element)
          // h is the first element of the list, t is the remainder (which can be Nil, aka empty)

          // `case Constructor(arg1, arg2) :: t =>`
          // means we want to match a list whose first element is of type Constructor, giving us access
          // to its internal values.

          // `case Constructor(arg1, arg2) :: t if(p(arg1, arg2)) =>`
          // means we want to match a list whose first element is of type Constructor while satisfying
          // some predicate p, called an if guard.
          case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable)
          case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true))
          case NotTaken(addr)  :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false))
          case NotTaken(addr)  :: t if(!predictionTable(addr)) => helper(t, predictionTable)
          case Nil => 0
        }
      }

      // Initially every possible branch is set to false since the initial state of the predictor is to
      // assume branch not taken
      def initState = events.map{
        case Taken(from, addr) => (from, false)
        case NotTaken(addr)    => (addr, false)
      }.toMap

      helper(events, initState)
    }

    say(OneBitInfiniteSlots(events))
  }
  true
}

def profileCache(testOptions: TestOptions): Boolean = {
  val testResults = for {
    lines   <- fileUtils.readTest(testOptions)
    program <- FiveStage.Parser.parseProgram(lines, testOptions)
    (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
  } yield {
    sealed trait MemoryEvent
    case class Write(addr: Int) extends MemoryEvent
    case class Read(addr: Int)  extends MemoryEvent

    val events: List[MemoryEvent] = trace.flatMap(_.event).collect{
      case MemWrite(x,_) => Write(x.value)
      case MemRead(x,_)  => Read(x.value)
    }

    // Your cache here
  }
  true
}
}


@ -6,6 +6,10 @@ import PrintUtils._
object TestUtils {
+  implicit class OptionBackport(t: Option.type){
+    def when[T](b: Boolean)(t: => T) = if(b) Some(t) else None
+  }
  /**
    * Generate and serialize BTrees for the test runner
    */


@ -142,7 +142,7 @@ private class ChiselTestRunner (
// After finishing, let the circuit run until all updates can be committed.
private def flush: List[CircuitTrace] =
-  (0 to 3).map(_ => stepOne).reverse.toList
+  (0 to 4).map(_ => stepOne).reverse.toList
/**
  * Run the entire shebang


@ -1,36 +1,55 @@
-* Question 1 - Hazards
-For the following programs describe each hazard with type (data or control), line number and a
-small (max one sentence) description
-** program 1
-#+begin_src asm
-addi t0, zero, 10
-addi t1, zero, 20
-.L2:
-sub t1, t1, t0
-beq t1, zero, .L2
-jr ra
-#+end_src
-** program 2
-#+begin_src asm
-addi t0, zero, 10
-lw t0, 10(t0)
-beq t0, zero, .L3
-jr ra
-#+end_src
-** program 3
-#+begin_src asm
-lw t0, 0(t0)
-lw t1, 4(t0)
-sw t0, 8(t1)
-lw t1, 12(t0)
-beq t0, t1, .L3
-jr ra
-#+end_src
+* Question 0 - Testing hazards
+This question is mandatory, but rewards no points (not directly at least).
+
+The tests found in the testing framework are useful for testing a fully working processor; however, they
+leave much to be desired when you actually want to design one from whole cloth.
+
+To rectify this, you should write some tests of your own that serve as minimal cases for the various
+hazards you will encounter. You do not need to deliver anything here, but I expect you to have
+these tests if you ask me for help debugging your design during lab hours.
+(You can of course come to lab hours if you're having trouble writing these tests.)
+
+** Forwarding
+The tests in forward1.s and forward2.s are automatically generated, long, and non-specific,
+thus not very well suited for debugging.
+
+You should write one (or more) test(s) that systematically expose your processor to dependency
+hazards, including instructions that:
++ Need forwarding from MEM and WB (i.e. dependencies with NOPs between them).
++ Expose results that should *not* be forwarded due to regWrite being false.
++ Write and read to/from the zero register.
+
+** Load freezes
+Load freezes are tricky since they interact with the forwarding unit, often causing
+bugs that appear with low frequency in the supplied test programs.
+
+You should write tests (I suggest one test per case) that systematically expose your processor to
+dependency hazards where one or more of the dependencies are memory accesses, including instructions that:
++ Need forwarding from MEM and WB where MEM, WB, or both are load instructions.
++ Expose false dependencies from MEM and WB where one or more are loads.
+  For instance, consider ~addi x1, x1, 20~ in machine code with the rs2 field highlighted:
+  0x01408093 = 0b0000000 | 10100 | 00001000000010010011
+  Here there is a false dependency on x20, since the rs2 bits are only an artefact of the immediate
+  value, which could cause an unnecessary freeze.
++ Write and read to/from the zero register, which could trigger an unnecessary freeze.
++ Cause multiple freezes in a row.
++ Cause multiple freezes in a row followed by an instruction with multiple dependencies.
+
+** Control hazards
+There are a lot of possible interactions when jumping and branching; you need to write tests
+that ensure instructions are properly bubbled if they shouldn't have been fetched.
+You should also test the interactions between forwarding and freezing here, i.e. what happens
+when the address calculation relies on forwarded values? What happens if the forwarded value
+comes from a load instruction, necessitating a freeze?
+
+* TODO Question 1 - Hazards
+Write programs here that are less of a crapshoot. Clarify dependency vs hazards etc. and
+*enforce* a format that is easy to grade.
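A false dependency of the kind discussed under Load freezes can be checked mechanically. The following is a standalone sketch, assuming the standard RV32I layout where rs2 occupies bits 24:20; the word 0x01408093 (~addi x1, x1, 20~) is used purely as an illustration and is not from the test suite.

```scala
// Extract the rs2 field (bits 24:20) of a 32 bit RISC-V instruction word.
object Rs2Sketch {
  def rs2(instr: Int): Int = (instr >>> 20) & 0x1f

  def main(args: Array[String]): Unit = {
    // For an I-type instruction these bits are immediate bits, not a real
    // register operand, so a naive hazard check sees a bogus x20 dependency.
    println(rs2(0x01408093)) // prints 20
  }
}
```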
* Question 2 - Handling hazards
@ -39,7 +58,7 @@
** Data hazards 1
At some cycle the following instructions can be found in a 5 stage design:
#+begin_src text
EX:                  || MEM:                    || WB:
---------------------||-------------------------||--------------------------
@ -52,13 +71,17 @@
branch = false       || branch = true           || branch = false
jump   = false       || jump   = false          || jump   = false
#+end_src
For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get data for rs1 and rs2?
+Answer should be on the form:
+rs1: Narnia
+rs2: Wikipedia
** Data hazards 2
At some cycle the following instructions can be found in a 5 stage design:
#+begin_src text
EX:                  || MEM:                    || WB:
---------------------||-------------------------||--------------------------
@ -73,11 +96,15 @@
#+end_src
For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get data for rs1 and rs2?
+Answer should be on the form:
+rs1: Random noise
+rs2: WB (MEM if it's a tuesday)
** Data hazards 3
At some cycle the following instructions can be found in a 5 stage design:
#+begin_src text
EX:                  || MEM:                    || WB:
---------------------||-------------------------||--------------------------
@ -89,24 +116,26 @@
memWrite = true      || memWrite = false        || memWrite = false
branch   = false     || branch   = false        || branch   = false
jump     = false     || jump     = false        || jump     = false
-Should the forwarding unit issue a load hazard signal?
-(Hint: what are the semantics of the instruction currently in EX stage?)
#+end_src
+Should the forwarding unit issue a load hazard signal? *This is a yes/no question*
+(Hint: what are the semantics of the instruction currently in EX stage?)
* Question 3 - Branch prediction
Consider a 2 bit branch predictor with only 4 slots for a 32 bit architecture (without BTB), where the decision to
take a branch or not is decided in accordance with the following table:
#+begin_src text
state  || predict taken || next state if taken || next state if not taken ||
=======||===============||=====================||=========================||
00     || NO            || 01                  || 00                      ||
-01    || NO            || 10                  || 00                      ||
-10    || YES           || 11                  || 01                      ||
+01    || NO            || 11                  || 00                      ||
+10    || YES           || 11                  || 00                      ||
11     || YES           || 11                  || 10                      ||
#+end_src
-(This is known as a saturating 2bit counter, it is *not* the same scheme as in the lecture slides)
+Which corresponds to this figure:
+#+CAPTION: FSM of a 2 bit branch predictor. Note that it is not a 2bit saturating counter.
+[[./Images/BranchPredictor.png]]
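The transition table can also be read as a small next-state function. This is a standalone illustration with states encoded 0 to 3 as in the table, not code from the repo:

```scala
// 2 bit predictor FSM: predict taken in states 2 and 3 (10 and 11).
object PredictorFsm {
  def predictTaken(state: Int): Boolean = state >= 2

  def next(state: Int, taken: Boolean): Int = (state, taken) match {
    case (0, true)  => 1
    case (0, false) => 0
    case (1, true)  => 3 // a taken branch in 01 jumps straight to 11
    case (1, false) => 0
    case (2, true)  => 3
    case (2, false) => 0 // a not-taken branch in 10 falls all the way to 00
    case (3, true)  => 3
    case (3, false) => 2
    case _          => state // states outside 0..3 are left unchanged
  }

  def main(args: Array[String]): Unit = {
    println(next(1, true)) // prints 3
  }
}
```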
At some point during execution the program counter is ~0xc~ and the branch predictor table looks like this:
#+begin_src text
@ -114,21 +143,34 @@
======||========
00    || 01
01    || 00
-10   || 11
-11   || 01
+10   || 01
+11   || 10
#+end_src
For the following program:
#+begin_src asm
-0xc  addi x1, x3, 10
-0x10 add x2, x1, x1
-0x14 beq x1, x2, .L1
+.L1:
+0x0C addi x1, x1, 1
+0x10 add x2, x2, x1
+0x14 bge x2, x3, .L1
0x18 j .L2
+.L2:
+0x1C addi x2, x2, 0x10
+0x20 slli x2, x2, 0x4
+0x24 jr ra
#+end_src
-Will the predictor predict taken or not taken for the beq instruction?
-* Question 4 - Benchmarking
+At cycle 0 the state of the machine is as follows:
+#+begin_src text
+PC = 0x0C
+x1 = 0x0
+x2 = 0x0
+x3 = 0x7
+#+end_src
+At which cycle will the PC be 0x24 given a 2 cycle delay for mispredicts?
+* Question 4 - Benchmarking a branch profiler
In order to gauge the performance increase from adding branch predictors it is necessary to do some testing.
Rather than writing a test from scratch it is better to use the tester already in use in the test harness.
When running a program the VM outputs a log of all events, including which branches have been taken and which
@ -148,7 +190,7 @@
To help you get started, I have provided you with much of the necessary code.
In order to get an idea for how you should profile branch misses, consider the following profiler which calculates
-misses for a processor with a 1 bit branch predictor with infinite memory:
+misses for a processor with a 1 bit branch predictor with infinite slots:
#+BEGIN_SRC scala
def OneBitInfiniteSlots(events: List[BranchEvent]): Int = {
@ -172,11 +214,11 @@
// `case Constructor(arg1, arg2) :: t => if(p(arg1, arg2))`
// means we want to match a list whose first element is of type Constructor while satisfying some predicate p,
// called an if guard.
case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable)
case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true))
case NotTaken(addr)  :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false))
case NotTaken(addr)  :: t if(!predictionTable(addr)) => helper(t, predictionTable)
case _ => 0
}
}
@ -207,7 +249,7 @@
With a 2 bit 8 slot scheme, how many mispredicts will happen?
Answer with a number.
Hint: Use the getTag method defined on int (in DataTypes.scala) to get the tag for an address.
#+BEGIN_SRC scala
val slots = 8
@ -221,7 +263,7 @@
say(0x1C5C.getTag(slots)) // prints 7
say(0x1C60.getTag(slots)) // prints 0 (thus conflicts with 0x1C40)
#+END_SRC
* Question 5 - Cache profiling
Unlike our design which has a very limited memory pool, real designs have access to vast amounts of memory, offset
@ -231,11 +273,6 @@
In order to investigate how caches can alter performance it is therefore necessary to make some rather
unrealistic assumptions to see how different cache schemes impact performance.
-We will therefore assume the following:
-+ Reads from main memory take 5 cycles
-+ cache has a total storage of 8 words (256 bits)
-+ cache reads work as they do now (i.e no additional latency)
For this exercise you will write a program that parses a log of memory events, similar to the previous task
#+BEGIN_SRC scala
sealed trait MemoryEvent
@ -246,32 +283,13 @@
def profile(events: List[MemoryEvent]): Int = ???
#+END_SRC
-** Your task
-Your job is to implement a model that tests how many delay cycles will occur for a cache which:
-+ Follows a 2-way associative scheme
-+ set size is 4 words (128 bits) (total cache size: a whopping 256 bits)
-+ Block size is 1 word (32 bits) meaning that we *do not need a block offset*.
-+ Is write-through write no-allocate (this means that you can ignore stores, only loads will affect the cache)
-+ Eviction policy is LRU (least recently used)
-In the typical cache each block has more than 32 bits, requiring an offset, however the
-simulated cache does not.
-This means that the simulated cache has two sets of 4 words, greatly reducing the complexity
-of your implementation.
-Additionally, assume that writes do not change the LRU counter.
-This means that your cache will only consider which value was most recently loaded,
-not written.
-It's not realistic, but it allows you to completely disregard write events (you can
-just filter them out if you want.)
-Your answer should be the number of cache miss latency cycles when using this cache.
-*** Further study
-If you have the time I strongly encourage you to experiment with a larger cache with bigger
-block sizes, forcing you to implement the additional complexity of block offsets.
-Likewise, by trying a different scheme than write-through no-allocate you will get a much
-better grasp on how exactly the cache works.
-This is *not* a deliverable, just something I encourage you to tinker with to get a better
-understanding.
+** TODO Your task
+Your job is to implement a *parameterised* model that tests how many delay cycles will occur for a cache with
+the following configuration:
++ Follows an n-way associative scheme (parameter)
++ Is write-through write allocate.
++ Eviction policy is LRU (least recently used)
+To make this task easier a data structure with stub methods has been implemented for you.
+Answer by pasting the output from running the branchProfiler test.