scalalab3 · AlexTheLost · Apr 18, 2016 · Apr 21, 2016 · Apr 21, 2016 · Apr 21, 2016
diff --git a/.gitignore b/.gitignore
@@ -15,3 +15,6 @@ project/plugins/project/
 # Scala-IDE specific
 .scala_dependencies
 .worksheet
+
+# IntelliJ IDEA
+/.idea
diff --git a/build.sbt b/build.sbt
@@ -0,0 +1,12 @@
+name := "lyrics-engine"
+
+version := "1.0"
+
+scalaVersion := "2.11.8"
+
+
+// Sub-projects of the build; each one is rooted in the directory that carries its name.
+lazy val raw_data_processor = project.in(file("raw_data_processor"))
+lazy val data_analyzer = project.in(file("data_analyzer"))
+lazy val http_api = project.in(file("http_api"))
+
+// TODO: Set up visibility of common dependencies across the sub-projects.
diff --git a/data_analyzer/build.sbt b/data_analyzer/build.sbt
@@ -0,0 +1,5 @@
+name := "data_analyzer"
+
+version := "1.0"
+
+scalaVersion := "2.11.8"
diff --git a/http_api/build.sbt b/http_api/build.sbt
@@ -0,0 +1,5 @@
+name := "http_api"
+
+version := "1.0"
+
+scalaVersion := "2.11.8"
diff --git a/raw_data_processor/build.sbt b/raw_data_processor/build.sbt
@@ -0,0 +1,9 @@
+name := "raw_data_processor"
+
+version := "1.0"
+
+scalaVersion := "2.11.8"
+
+
+// Typesafe Config is a plain Java library, so its artifact name has no Scala version suffix.
+libraryDependencies += "com.typesafe" % "config" % "1.3.0"
+// ScalaTest is only used by the specs under src/test: `%%` selects the artifact that matches
+// scalaVersion, and the "test" configuration keeps it off the main classpath.
+libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.6" % "test"
diff --git a/raw_data_processor/project/build.properties b/raw_data_processor/project/build.properties
@@ -0,0 +1 @@
+sbt.version = 0.13.8
diff --git a/raw_data_processor/project/plugins.sbt b/raw_data_processor/project/plugins.sbt
@@ -0,0 +1 @@
+logLevel := Level.Warn
diff --git a/raw_data_processor/src/main/resources/application.conf b/raw_data_processor/src/main/resources/application.conf
@@ -0,0 +1 @@
+path_to_msd = "C:\\Temp\\mxm_dataset_train.txt"
diff --git a/raw_data_processor/src/main/scala/scalalab3/lyricsengine/processor/SongProcessor.scala b/raw_data_processor/src/main/scala/scalalab3/lyricsengine/processor/SongProcessor.scala
@@ -0,0 +1,7 @@
+package scalalab3.lyricsengine.processor
+
+/**
+  * Performs operations on songs.
+  *
+  * Currently an empty placeholder: no operations are defined yet.
+  */
+object SongProcessor {
+}
diff --git a/raw_data_processor/src/main/scala/scalalab3/lyricsengine/processor/WordsProcessor.scala b/raw_data_processor/src/main/scala/scalalab3/lyricsengine/processor/WordsProcessor.scala
@@ -0,0 +1,7 @@
+package scalalab3.lyricsengine.processor
+
+/**
+  * Performs operations on words.
+  *
+  * Currently an empty placeholder: no operations are defined yet.
+  */
+object WordsProcessor {
+}
diff --git a/raw_data_processor/src/main/scala/scalalab3/lyricsengine/reader/MSD.scala b/raw_data_processor/src/main/scala/scalalab3/lyricsengine/reader/MSD.scala
@@ -0,0 +1,9 @@
+package scalalab3.lyricsengine.reader
+
+
+/**
+  * Represents the data read from a Million Song Dataset (MSD) lyrics file.
+  *
+  * Each [[Song]] carries its `trackId`, its `mxmTrackId` and a `words` map built from the
+  * `<word idx>:<cnt>` entries of its line, i.e. word index to count for that song.
+  * NOTE(review): the dataset header says word indexes start at 1, so index `i` presumably
+  * refers to `words(i - 1)` of the enclosing [[MSD]] - confirm before relying on it.
+  *
+  * @param words the top words listed in the file, in file order
+  * @param songs the songs listed in the file
+  */
+case class MSD(words: Seq[String], songs: Seq[Song])
+
+case class Song(trackId: String, mxmTrackId: String, words: Map[Int, Int])
diff --git a/raw_data_processor/src/main/scala/scalalab3/lyricsengine/reader/Reader.scala b/raw_data_processor/src/main/scala/scalalab3/lyricsengine/reader/Reader.scala
@@ -0,0 +1,49 @@
+package scalalab3.lyricsengine.reader
+
+import com.typesafe.config.ConfigFactory
+
+import scala.io.Source
+
+/**
+  * Reads the Million Song Dataset (MSD) lyrics file whose location is configured under the
+  * `path_to_msd` key and converts it into an [[MSD]] value.
+  *
+  * File layout, as described in the dataset header:
+  *  - lines starting with `#` are comments and are ignored;
+  *  - the line starting with `%` is the comma-separated list of top words;
+  *  - every other line is `trackId,mxmTrackId,<word idx>:<cnt>,...`.
+  */
+object Reader {
+  val conf = ConfigFactory.load()
+
+  /** Entry point: prints the extracted [[MSD]] to standard output. */
+  def main(args: Array[String]): Unit = {
+    print(extract())
+  }
+
+  /**
+    * Loads and parses the dataset file referenced by the `path_to_msd` configuration key.
+    *
+    * @return the top words and the songs found in the file
+    * @throws IllegalArgumentException if the file content does not follow the expected layout
+    */
+  def extract(): MSD = {
+    val path = conf.getString("path_to_msd")
+    val (wordsLine, songLines) = read(path)
+    MSD(extractWords(wordsLine), extractSongs(songLines))
+  }
+
+  /**
+    * Reads the file and separates the top-words line from the song lines.
+    *
+    * Comment and blank lines are skipped, so the result no longer depends on the exact number
+    * of header lines in the file.
+    */
+  private def read(path: String): (String, Seq[String]) = {
+    // The dataset header states that the file is UTF-8 encoded; do not rely on the platform default.
+    val source = Source.fromFile(path, "UTF-8")
+    try {
+      // Materialise eagerly: all lines must be read before the source is closed below.
+      val dataLines = source.getLines()
+        .filterNot(line => line.trim.isEmpty || line.startsWith("#"))
+        .toVector
+      dataLines.headOption match {
+        case Some(wordsLine) if wordsLine.startsWith("%") => (wordsLine, dataLines.tail)
+        case _ =>
+          throw new IllegalArgumentException(
+            s"Expected a '%' top-words line before the song lines in $path")
+      }
+    } finally {
+      source.close()
+    }
+  }
+
+  /** Drops the leading `%` marker and splits the comma-separated word list. */
+  private def extractWords(words: String): Seq[String] = words.substring(1).split(",").toVector
+
+  /** Parses every song line into a [[Song]]. */
+  private def extractSongs(songsData: Seq[String]): Seq[Song] = songsData.map { songData =>
+    val elements = songData.split(",")
+    require(elements.length >= 2, s"Malformed song line: $songData")
+    Song(elements(0), elements(1), extractSongWords(elements.drop(2)))
+  }
+
+  /** Parses `<word idx>:<cnt>` entries into a map from word index to count. */
+  private def extractSongWords(words: Array[String]): Map[Int, Int] = words.map { sourcePair =>
+    // An explicit exception is used instead of `assert`, which can be elided at compile time.
+    sourcePair.split(":") match {
+      case Array(index, count) => index.toInt -> count.toInt
+      case _ =>
+        throw new IllegalArgumentException(s"Expected <word idx>:<cnt> but got: $sourcePair")
+    }
+  }.toMap
+}
+
+
diff --git a/raw_data_processor/src/test/resources/test_msd.txt b/raw_data_processor/src/test/resources/test_msd.txt
@@ -0,0 +1,20 @@
+# TRAINING SET
+# MusiXmatch dataset, the official lyrics dataset
+# of the Million Song Dataset
+#    file created on Tue Mar 29 04:28:44 2011
+#    contact: T. Bertin-Mahieux (Columbia University)
+#             [email protected]
+#    also: http://labrosa.ee.columbia.edu/millionsong/musixmatch
+#          http://www.musixmatch.com
+# FORMAT:
+#     #   - comment, to ignore
+#     %   - list of top words, comma-separated
+#         - normal line, contains track_id, mxm track id,
+#           then word count for each of the top words, comma-separated
+#           word count is in sparse format -> ...,<word idx>:<cnt>,...
+#           <word idx> starts at 1 (not zero!)
+# All our work is done using UTF-8 encoding.
+# enjoy!
+%i,the,you,to,and,a,me,it,not,in,my,is,of,your,that,do,on,are,we,am,will,all,for,no,be,have,love,so,know,this,but,with,what,just,when,like,now,que,time,can,come,de,there,go,up,oh,la,one,they,out,down,get,she,was,see,if,got,never,from,he,feel,want,let,make,way,say,take,would,as,ca,day,at,babi,away
+TRZZZYV128F92E996D,6849828,1:10,2:6,3:20,5:2,7:30,8:1,9:6,10:3,11:19,12:1,13:16,14:6,15:1,18:4,20:2,21:2,23:1,24:4,26:4,31:3,35:1,37:2,50:15,52:21,56:1,57:3,59:3,63:6,68:2,74:3,75:3,84:1,92:1,94:3,100:1,101:2,111:2,121:3,123:5,142:3,144:1,155:1,162:31,167:6,184:3,203:3,216:1,227:1,240:2,261:1,358:1,368:2,372:4,382:3,403:2,415:2,434:1,467:3,485:1,571:1,592:3,627:4,672:6,1023:1,1123:1,1147:1,1247:1,1262:1,1294:1,1375:2,1391:15,1542:1,1619:1,2386:1,4817:3
+TRZZZYX128F92D32C6,681124,1:4,2:18,4:3,5:6,6:9,7:1,8:5,9:1,10:12,11:2,12:9,13:5,15:4,17:2,18:4,19:4,21:2,22:3,25:3,29:2,30:3,32:5,34:3,35:2,37:2,39:3,43:2,44:7,45:2,49:2,50:2,52:8,54:2,56:1,59:1,63:10,64:1,66:1,71:2,72:1,78:1,79:1,84:2,91:1,92:2,95:1,96:1,103:1,110:1,115:1,127:1,134:4,135:2,136:3,137:1,138:2,140:1,146:2,150:1,152:1,192:1,206:1,207:1,222:3,239:1,248:2,258:1,270:1,274:1,283:1,294:2,300:1,305:1,318:1,337:1,338:1,346:1,347:1,349:1,371:1,398:1,406:1,445:1,451:1,459:1,478:1,487:1,492:1,502:3,516:1,539:1,548:2,549:1,553:1,592:2,617:1,633:1,666:1,681:1,727:1,774:1,775:7,789:1,811:1,844:1,915:1,942:1,950:1,979:1,1008:1,1040:1,1080:9,1142:1,1232:1,1366:1,1409:1,1412:1,1537:1,1545:1,1597:1,1841:2,1861:1,1901:1,1907:1,2063:1,2097:1,2167:1,2198:1,2221:1,2468:1,2498:1,2595:1,2698:1,2975:1,2990:1,2996:1,3256:1,3267:1,3316:2,3355:1,4198:1,4356:1,4738:1,4845:1
diff --git a/raw_data_processor/src/test/scala/ReaderTest.scala b/raw_data_processor/src/test/scala/ReaderTest.scala
@@ -0,0 +1,9 @@
+import org.scalatest.{FlatSpec, Matchers}
+
+/**
+  * Specification for the MSD reader.
+  *
+  * The only spec is a placeholder: the commented-out lines call `Reader.extract` with a resource
+  * path, which the reader does not accept yet.
+  */
+class ReaderTest extends FlatSpec with Matchers {
+
+  "A MSD" should "contains" in {
+    //    val raw = Source.fromInputStream(getClass.getResourceAsStream("/test_msd.txt")).mkString
+    //    Reader.extract("/test_msd.txt")
+    // Report the spec as pending instead of letting an empty body pass vacuously.
+    pending
+  }
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		path_to_msd = "C:\\Temp\\mxm_dataset_train.txt"