Intel-bigdata · xuechendi · Mar 18, 2020 · Mar 18, 2020 · Mar 5, 2020 · May 11, 2020
diff --git a/.scalafmt.conf b/.scalafmt.conf
@@ -0,0 +1,11 @@
+version = 2.4.2
+align = none
+align.openParenDefnSite = false
+align.openParenCallSite = false
+align.tokens = []
+optIn = {
+  configStyleArguments = false
+}
+danglingParentheses = false
+docstrings = JavaDoc
+maxColumn = 98
diff --git a/core/pom.xml b/core/pom.xml
@@ -13,7 +13,7 @@
     <packaging>jar</packaging>
 
     <groupId>com.intel.spark-pmof.java</groupId>
-    <artifactId>java</artifactId>
+    <artifactId>spark-pmof-java</artifactId>
     <version>1.0</version>
 
     <dependencies>
@@ -40,6 +40,11 @@
             <artifactId>hpnl</artifactId>
             <version>0.5</version>
         </dependency>
+        <dependency>
+            <groupId>com.intel.rpmp</groupId>
+            <artifactId>rpmp</artifactId>
+            <version>0.1</version>
+        </dependency>
         <dependency>
             <groupId>org.xerial</groupId>
             <artifactId>sqlite-jdbc</artifactId>

diff --git a/core/src/main/java/org/apache/spark/storage/pmof/RemotePersistentMemoryPool.java b/core/src/main/java/org/apache/spark/storage/pmof/RemotePersistentMemoryPool.java
@@ -0,0 +1,72 @@
+package org.apache.spark.storage.pmof;
+
+import com.intel.rpmp.PmPoolClient;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+public class RemotePersistentMemoryPool {
+  private static String remote_host;
+  private static String remote_port_str;
+
+  private RemotePersistentMemoryPool(String remote_address, String remote_port) throws IOException {
+    pmPoolClient = new PmPoolClient(remote_address, remote_port);
+  }
+
+  public static RemotePersistentMemoryPool getInstance(String remote_address, String remote_port) throws IOException {
+    synchronized (RemotePersistentMemoryPool.class) {
+      if (instance == null) {
+        if (instance == null) {
+          remote_host = remote_address;
+          remote_port_str = remote_port;
+          instance = new RemotePersistentMemoryPool(remote_address, remote_port);
+        }
+      }
+    }
+    return instance;
+  }
+
+  public static int close() {
+    synchronized (RemotePersistentMemoryPool.class) {
+      if (instance != null)
+        return instance.dispose();
+      else
+        return 0;
+    }
+  }
+
+  public static String getHost() {
+    return remote_host;
+  }
+
+  public static int getPort() {
+    return Integer.parseInt(remote_port_str);
+  }
+
+  public int read(long address, long size, ByteBuffer byteBuffer) {
+    return pmPoolClient.read(address, size, byteBuffer);
+  }
+
+  public long put(String key, ByteBuffer data, long size) {
+    return pmPoolClient.put(key, data, size);
+  }
+
+  public long get(String key, long size, ByteBuffer data) {
+    return pmPoolClient.get(key, size, data);
+  }
+
+  public long[] getMeta(String key) {
+    return pmPoolClient.getMeta(key);
+  }
+
+  public int del(String key) throws IOException {
+    return pmPoolClient.del(key);
+  }
+
+  public int dispose() {
+    pmPoolClient.dispose();
+    return 0;
+  }
+
+  private static PmPoolClient pmPoolClient;
+  private static RemotePersistentMemoryPool instance;
+}
diff --git a/core/src/main/scala/org/apache/spark/network/pmof/Client.scala b/core/src/main/scala/org/apache/spark/network/pmof/Client.scala
@@ -4,9 +4,10 @@ import java.nio.ByteBuffer
 import java.util.concurrent.ConcurrentHashMap
 
 import com.intel.hpnl.core.{Connection, EqService}
+import org.apache.spark.internal.Logging
 import org.apache.spark.shuffle.pmof.PmofShuffleManager
 
-class Client(clientFactory: ClientFactory, val shuffleManager: PmofShuffleManager, con: Connection) {
+class Client(clientFactory: ClientFactory, val shuffleManager: PmofShuffleManager, con: Connection) extends Logging {
   final val outstandingReceiveFetches: ConcurrentHashMap[Long, ReceivedCallback] =
     new ConcurrentHashMap[Long, ReceivedCallback]()
   final val outstandingReadFetches: ConcurrentHashMap[Int, ReadCallback] =

diff --git a/core/src/main/scala/org/apache/spark/network/pmof/ClientFactory.scala b/core/src/main/scala/org/apache/spark/network/pmof/ClientFactory.scala
@@ -5,10 +5,11 @@ import java.nio.ByteBuffer
 import java.util.concurrent.ConcurrentHashMap
 
 import com.intel.hpnl.core._
+import org.apache.spark.internal.Logging
 import org.apache.spark.shuffle.pmof.PmofShuffleManager
 import org.apache.spark.util.configuration.pmof.PmofConf
 
-class ClientFactory(pmofConf: PmofConf) {
+class ClientFactory(pmofConf: PmofConf) extends Logging {
   final val eqService = new EqService(pmofConf.clientWorkerNums, pmofConf.clientBufferNums, false).init()
   private[this] final val cqService = new CqService(eqService).init()
   private[this] final val clientMap = new ConcurrentHashMap[InetSocketAddress, Client]()
@@ -28,6 +29,7 @@ class ClientFactory(pmofConf: PmofConf) {
     var client = clientMap.get(socketAddress)
     if (client == null) {
       ClientFactory.this.synchronized {
+        logInfo(s"createClient target is ${address}:${port}")
         client = clientMap.get(socketAddress)
         if (client == null) {
           val con = eqService.connect(address, port.toString, 0)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala
@@ -0,0 +1,232 @@
+package org.apache.spark.scheduler
+
+import java.nio.ByteBuffer
+import java.io.{Externalizable, ObjectInput, ObjectOutput}
+
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+import org.roaringbitmap.RoaringBitmap
+
+import org.apache.spark.SparkEnv
+import org.apache.spark.internal.config
+import org.apache.spark.storage.BlockManagerId
+import org.apache.spark.util.Utils
+
+/**
+ * Summary of a map task's output: the location the task ran at and an estimated size for
+ * each reduce block.
+ *
+ * The implementations in this file are [[CompressedMapStatus]] and
+ * [[HighlyCompressedMapStatus]]; `MapStatus.apply` chooses between them.
+ */
+private[spark] trait MapStatus {
+
+  /** Location where this task was run. */
+  def location: BlockManagerId
+
+  /**
+   * Estimated size for the reduce block, in bytes.
+   *
+   * If a block is non-empty, then this method MUST return a non-zero size.  This invariant is
+   * necessary for correctness, since block fetchers are allowed to skip zero-size blocks.
+   */
+  def getSizeForBlock(reduceId: Int): Long
+
+}
+
+/** Factory and size-encoding helpers shared by the [[MapStatus]] implementations. */
+private[spark] object MapStatus {
+
+  /** Base of the logarithm used to squeeze a block size into a single byte. */
+  private[this] val LOG_BASE = 1.1
+
+  /**
+   * Builds a [[MapStatus]] for the given location and per-reducer block sizes. Outputs with
+   * more than 2000 partitions use [[HighlyCompressedMapStatus]]; all others use
+   * [[CompressedMapStatus]].
+   */
+  def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): MapStatus =
+    if (uncompressedSizes.length <= 2000) {
+      new CompressedMapStatus(loc, uncompressedSizes)
+    } else {
+      HighlyCompressedMapStatus(loc, uncompressedSizes)
+    }
+
+  /**
+   * Compress a size in bytes to 8 bits for efficient reporting of map output sizes.
+   * The encoded value is the log base 1.1 of the size, rounded up and capped at 255, which
+   * can support sizes up to 35 GB with at most 10% error. Zero stays zero; any other size
+   * that is at most 1 encodes as 1.
+   */
+  def compressSize(size: Long): Byte = size match {
+    case 0L => 0
+    case s if s <= 1L => 1
+    case s =>
+      val exponent = math.ceil(math.log(s) / math.log(LOG_BASE)).toInt
+      math.min(255, exponent).toByte
+  }
+
+  /**
+   * Decompress an 8-bit encoded block size, using the reverse operation of compressSize.
+   */
+  def decompressSize(compressedSize: Byte): Long =
+    if (compressedSize != 0) {
+      // Mask so the byte is read as an unsigned value in [0, 255].
+      val exponent = compressedSize & 0xFF
+      math.pow(LOG_BASE, exponent).toLong
+    } else {
+      0L
+    }
+}
+
+/**
+ * A [[MapStatus]] that keeps one byte per reduce partition, holding the block size in the
+ * encoded form produced by `MapStatus.compressSize`.
+ *
+ * @param loc location where the task is being executed.
+ * @param compressedSizes size of the blocks, indexed by reduce partition id.
+ */
+private[spark] class CompressedMapStatus(
+    private[this] var loc: BlockManagerId,
+    private[this] var compressedSizes: Array[Byte])
+    extends MapStatus
+    with Externalizable {
+
+  // For deserialization only: the fields are filled in by readExternal.
+  protected def this() = this(null, null.asInstanceOf[Array[Byte]])
+
+  /** Builds the status from raw sizes, encoding each with `MapStatus.compressSize`. */
+  def this(loc: BlockManagerId, uncompressedSizes: Array[Long]) =
+    this(loc, uncompressedSizes.map(MapStatus.compressSize))
+
+  override def location: BlockManagerId = loc
+
+  /** Decodes the stored byte for `reduceId` back into an estimated size in bytes. */
+  override def getSizeForBlock(reduceId: Int): Long = {
+    val encoded = compressedSizes(reduceId)
+    MapStatus.decompressSize(encoded)
+  }
+
+  /** Writes the location, then the number of sizes, then the encoded sizes themselves. */
+  override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
+    loc.writeExternal(out)
+    val sizes = compressedSizes
+    out.writeInt(sizes.length)
+    out.write(sizes)
+  }
+
+  /** Reads the fields back in the same order writeExternal wrote them. */
+  override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
+    loc = BlockManagerId(in)
+    // The length prefix sizes the array before its contents are read.
+    compressedSizes = new Array[Byte](in.readInt())
+    in.readFully(compressedSizes)
+  }
+}
+
+/**
+ * A [[MapStatus]] that records an encoded size for each huge block, one shared average size
+ * for the remaining non-empty blocks, and a bitmap of the blocks that are empty. The companion
+ * object decides which blocks count as huge, using spark.shuffle.accurateBlockThreshold.
+ *
+ * @param loc location where the task is being executed
+ * @param numNonEmptyBlocks the number of non-empty blocks
+ * @param emptyBlocks a bitmap tracking which blocks are empty
+ * @param avgSize average size of the non-empty and non-huge blocks
+ * @param hugeBlockSizes sizes of huge blocks by their reduceId.
+ */
+private[spark] class HighlyCompressedMapStatus private (
+    private[this] var loc: BlockManagerId,
+    private[this] var numNonEmptyBlocks: Int,
+    private[this] var emptyBlocks: RoaringBitmap,
+    private[this] var avgSize: Long,
+    private var hugeBlockSizes: Map[Int, Byte])
+    extends MapStatus
+    with Externalizable {
+
+  // The loc == null test comes first because the deserialization constructor passes nulls,
+  // and the later clauses would dereference hugeBlockSizes.
+  require(
+    loc == null || avgSize > 0 || hugeBlockSizes.nonEmpty || numNonEmptyBlocks == 0,
+    "Average size can only be zero for map stages that produced no output")
+
+  // For deserialization only: the real values are restored by readExternal.
+  protected def this() = this(null, -1, null, -1, null)
+
+  override def location: BlockManagerId = loc
+
+  /**
+   * Returns 0 for a block marked empty, the decoded size for a huge block, and the shared
+   * average size for every other block.
+   */
+  override def getSizeForBlock(reduceId: Int): Long = {
+    assert(hugeBlockSizes != null)
+    if (emptyBlocks.contains(reduceId)) {
+      0L
+    } else {
+      hugeBlockSizes.get(reduceId).map(MapStatus.decompressSize).getOrElse(avgSize)
+    }
+  }
+
+  /**
+   * Writes the location, the empty-block bitmap, the average size, and then the huge-block
+   * entries as a count followed by (reduceId, encoded size) pairs.
+   */
+  override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
+    loc.writeExternal(out)
+    emptyBlocks.writeExternal(out)
+    out.writeLong(avgSize)
+    out.writeInt(hugeBlockSizes.size)
+    hugeBlockSizes.foreach { case (blockId, encodedSize) =>
+      out.writeInt(blockId)
+      out.writeByte(encodedSize)
+    }
+  }
+
+  /** Reads the fields back in the same order writeExternal wrote them. */
+  override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
+    loc = BlockManagerId(in)
+    emptyBlocks = new RoaringBitmap()
+    emptyBlocks.readExternal(in)
+    avgSize = in.readLong()
+    val count = in.readInt()
+    // Each entry is an Int reduce id followed by a single encoded-size byte.
+    val entries = (0 until count).map { _ =>
+      val block = in.readInt()
+      val size = in.readByte()
+      block -> size
+    }
+    hugeBlockSizes = entries.toMap
+  }
+}
+
+/** Factory for [[HighlyCompressedMapStatus]]. */
+private[spark] object HighlyCompressedMapStatus {
+
+  /**
+   * Builds a [[HighlyCompressedMapStatus]] from the raw per-reducer block sizes.
+   *
+   * Blocks whose size is not positive are recorded as empty. Blocks at or above the
+   * accurate-block threshold keep their own encoded size. All remaining blocks share one
+   * average size.
+   */
+  def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): HighlyCompressedMapStatus = {
+    // We must keep track of which blocks are empty so that we don't report a zero-sized
+    // block as being non-empty (or vice-versa) when using the average block size.
+    // Tracking the empty blocks rather than the non-empty ones is a performance choice: far
+    // fewer of them are expected, so fewer bitmap insertions are needed.
+    val emptyBlocks = new RoaringBitmap()
+    // Use the configured threshold when a SparkEnv exists, otherwise the config default.
+    val accurateThreshold = Option(SparkEnv.get) match {
+      case Some(env) => env.conf.get(config.SHUFFLE_ACCURATE_BLOCK_THRESHOLD)
+      case None => config.SHUFFLE_ACCURATE_BLOCK_THRESHOLD.defaultValue.get
+    }
+    val hugeBlocks = ArrayBuffer.empty[(Int, Byte)]
+    var nonEmptyCount = 0
+    var smallCount = 0
+    var smallTotal = 0L
+    val blockCount = uncompressedSizes.length
+    var reduceId = 0
+    while (reduceId < blockCount) {
+      val size = uncompressedSizes(reduceId)
+      if (size <= 0) {
+        emptyBlocks.add(reduceId)
+      } else {
+        nonEmptyCount += 1
+        // Huge blocks are left out of the average so that it stays accurate for small blocks.
+        if (size >= accurateThreshold) {
+          hugeBlocks += (reduceId -> MapStatus.compressSize(size))
+        } else {
+          smallTotal += size
+          smallCount += 1
+        }
+      }
+      reduceId += 1
+    }
+    val avgSize = if (smallCount == 0) 0L else smallTotal / smallCount
+    emptyBlocks.trim()
+    emptyBlocks.runOptimize()
+    new HighlyCompressedMapStatus(loc, nonEmptyCount, emptyBlocks, avgSize, hugeBlocks.toMap)
+  }
+}