Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create project structure #2

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,6 @@ project/plugins/project/
# Scala-IDE specific
.scala_dependencies
.worksheet

# IntelliJ IDEA
/.idea
12 changes: 12 additions & 0 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Root build definition of the lyrics-engine multi-project build.
name := "lyrics-engine"

version := "1.0"

scalaVersion := "2.11.8"


// Sub-projects; each one is rooted in the directory passed to `file(...)`.
lazy val raw_data_processor = project in file("raw_data_processor")
lazy val data_analyzer = project in file("data_analyzer")
lazy val http_api = project in file("http_api")

// TODO: Set up visibility of common dependencies (share them across the sub-projects).
8 changes: 8 additions & 0 deletions data_analyzer/build.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Build definition of the data_analyzer sub-project.
name := "data_analyzer"

version := "1.0"

scalaVersion := "2.11.8"

// Spark dependencies, currently disabled.
// NOTE(review): the `_2.10` artifact suffixes do not match the 2.11.x scalaVersion above;
// presumably they must become `_2.11` (or use `%%`) before being enabled - confirm.
//libraryDependencies += "org.apache.spark" % "spark-core_2.10" % "1.6.1"
//libraryDependencies += "org.apache.spark" % "spark-mllib_2.10" % "1.6.1"
5 changes: 5 additions & 0 deletions http_api/build.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Build definition of the http_api sub-project (no dependencies declared yet).
name := "http_api"

version := "1.0"

scalaVersion := "2.11.8"
26 changes: 26 additions & 0 deletions raw_data_processor/build.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Build definition of the raw_data_processor sub-project.
name := "raw_data_processor"

version := "1.0"

scalaVersion := "2.11.8"

// -------- -------- Dependencies: -------- --------
// Scala libraries are declared with `%%` so that sbt appends the Scala binary
// version (`_2.11` for the scalaVersion above) instead of hard-coding it.

// -------- Config:
// https://github.com/typesafehub/config
libraryDependencies += "com.typesafe" % "config" % "1.3.0"

// -------- Logger:
// http://logback.qos.ch/
libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.1.7"
// https://github.com/typesafehub/scala-logging
libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.4.0"

// -------- Test:
// http://www.scalatest.org/
// Scoped to the `test` configuration so ScalaTest stays off the compile/runtime classpath.
libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.6" % "test"

// -------- DB:
// mongo:
// http://mongodb.github.io/mongo-scala-driver/1.1/
libraryDependencies += "org.mongodb.scala" %% "mongo-scala-driver" % "1.1.0"
1 change: 1 addition & 0 deletions raw_data_processor/project/build.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# sbt version requested for this build.
sbt.version = 0.13.8
1 change: 1 addition & 0 deletions raw_data_processor/project/plugins.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Sets the sbt log level for this plugins definition to Warn.
logLevel := Level.Warn
12 changes: 12 additions & 0 deletions raw_data_processor/src/main/resources/application.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// ---- ---- Million Song Dataset (MSD) data:
// Path to the unpacked musiXmatch training subset, downloaded from:
// http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/mxm_dataset_train.txt.zip
// NOTE(review): this is a machine-specific Windows path - adjust it for the local environment.
msd.pathToMsd = "C:\\Temp\\mxm_dataset_train.txt"

// ---- ---- DB:
// MongoDB connection settings.
// A handy MongoDB admin UI: https://github.com/mrvautin/adminMongo
// Short introduction to MongoDB installation (and the "errno 10061" connection failure):
// http://stackoverflow.com/questions/26585433/mongodb-failed-to-connect-to-127-0-0-127017-reason-errno10061
mongoDb.host = localhost
mongoDb.port = 27017
// NOTE(review): "liric" may be a typo for "lyric" - confirm before renaming, it is the live database name.
mongoDb.dbName = liric
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package scalalab3.lyricsengine.processor

/**
 * Performs operations on songs.
 *
 * Placeholder: no members are defined yet.
 */
object SongProcessor {
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package scalalab3.lyricsengine.processor

/**
 * Performs operations on words.
 *
 * Placeholder: no members are defined yet.
 */
object WordsProcessor {
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package scalalab3.lyricsengine.reader

import scala.io.Source

/**
 * Reads the musiXmatch subset of the Million Song Dataset (MSD) from a text file.
 *
 * File format, as described in the header of the data file itself:
 *  - lines starting with `#` are comments and are ignored;
 *  - the line starting with `%` holds the comma-separated list of top words;
 *  - every other non-blank line describes one song:
 *    `trackId,mxmTrackId,<word idx>:<cnt>,...`
 */
object MillionSongDataSubSetReader {

  /** Prefix of comment lines, which carry no data. */
  private val CommentPrefix = "#"

  /** Prefix of the line holding the comma-separated top words. */
  private val WordsPrefix = "%"

  /**
   * Loads and parses the data set stored at `path`.
   *
   * @param path path to the MSD subset text file (UTF-8 encoded)
   * @return the top words and the per-song word counts found in the file
   * @throws IllegalArgumentException if the file has no words line or a data line is malformed
   */
  def extract(path: String): MillionSongDataSubSet = {
    val (wordsLine, songLines) = readRawData(path)
    MillionSongDataSubSet(extractWords(wordsLine), extractSongs(songLines))
  }

  /** Reads the file and splits its data lines into the words line and the song lines. */
  private def readRawData(path: String): (String, Vector[String]) = {
    // The data set header states that the file is UTF-8 encoded.
    val source = Source.fromFile(path, "UTF-8")
    // Materialise the lines before closing, so the file handle is always released.
    val lines =
      try source.getLines().toVector
      finally source.close()

    // Locate the data by its format markers rather than by fixed line numbers.
    val dataLines = lines.filter(line => line.trim.nonEmpty && !line.startsWith(CommentPrefix))
    val (wordsLines, songLines) = dataLines.partition(_.startsWith(WordsPrefix))
    val wordsLine = wordsLines.headOption.getOrElse(
      throw new IllegalArgumentException(s"No words line (starting with '$WordsPrefix') found in $path"))
    (wordsLine, songLines)
  }

  /** Drops the leading `%` marker and splits the remainder on commas. */
  private def extractWords(wordsLine: String): Seq[String] =
    wordsLine.stripPrefix(WordsPrefix).split(",").toVector

  /** Parses every song line into a [[Song]]. */
  private def extractSongs(songLines: Seq[String]): Seq[Song] =
    songLines.map { line =>
      line.split(",") match {
        case Array(trackId, mxmTrackId, counts @ _*) =>
          Song(trackId, mxmTrackId, extractSongWords(counts))
        case _ =>
          throw new IllegalArgumentException(s"Malformed song line: '$line'")
      }
    }

  /** Parses sparse `<word idx>:<cnt>` entries into a map from word index to count. */
  private def extractSongWords(counts: Seq[String]): Map[Int, Int] =
    counts.map { entry =>
      entry.split(":") match {
        case Array(wordIndex, count, _*) => wordIndex.toInt -> count.toInt
        case _ =>
          throw new IllegalArgumentException(s"Malformed '<word idx>:<cnt>' entry: '$entry'")
      }
    }.toMap
}

/**
 * Represents a subset of data from the Million Song Dataset (MSD).
 * http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/mxm_dataset_train.txt.zip
 *
 * @param words the top words, in the order they are listed in the data file
 * @param songs the songs together with their word counts
 */
final case class MillionSongDataSubSet(words: Seq[String], songs: Seq[Song])

/**
 * Word counts of a single track.
 *
 * @param trackId    first field of the song line (track id)
 * @param mxmTrackId second field of the song line (musiXmatch track id)
 * @param words      word index (per the data file header, indices start at 1) mapped to its count
 */
final case class Song(trackId: String, mxmTrackId: String, words: Map[Int, Int])



Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package scalalab3.lyricsengine.repositories


import scalalab3.lyricsengine.system.{ProjectConfig, ProjectLogger}

/**
 * Repository for mongoDB.
 *
 * Every operation is still an unimplemented stub: each body is `???`,
 * which throws `scala.NotImplementedError` when invoked.
 */
// TODO: move to API?
class MillionSongDataSubSetRepository extends ProjectConfig with ProjectLogger {
/** Saves the given songs. Not implemented yet. */
def saveSongs(songs: Any) = ???

/**
 * Saves the given words. Not implemented yet.
 * NOTE(review): the parameter is named `songs`; presumably `words` was intended - confirm.
 */
def saveWords(songs: Any) = ???


/** Returns all stored data. Not implemented yet. */
def findAll() = ???
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package scalalab3.lyricsengine.system

import com.typesafe.config.ConfigFactory


/**
 * Mixin exposing the application settings obtained from `ConfigFactory.load()`.
 */
trait ProjectConfig {
  val conf = ConfigFactory.load()

  // Keys looked up in the loaded configuration.
  // ---- Million Song Dataset data:
  val PATH_TO_MSD_SUBSET_VAR_NAME: String = "msd.pathToMsd"
  // ---- MongoDB:
  val MONGODB_HOST_VAR_NAME: String = "mongoDb.host"
  val MONGODB_PORT_VAR_NAME: String = "mongoDb.port"
  val MONGODB_DB_NAME_VAR_NAME: String = "mongoDb.dbName"

  /** Value of the `msd.pathToMsd` setting. */
  def getPathToData(): String = {
    conf.getString(PATH_TO_MSD_SUBSET_VAR_NAME)
  }

  /** Value of the `mongoDb.host` setting. */
  def getMongoDbHost(): String = {
    conf.getString(MONGODB_HOST_VAR_NAME)
  }

  /** Value of the `mongoDb.port` setting, read as an integer. */
  def getMongoDbPort(): Int = {
    conf.getInt(MONGODB_PORT_VAR_NAME)
  }

  /** Value of the `mongoDb.dbName` setting. */
  def getMongoDbName(): String = {
    conf.getString(MONGODB_DB_NAME_VAR_NAME)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package scalalab3.lyricsengine.system

import com.typesafe.scalalogging.Logger
import org.slf4j.LoggerFactory

/**
 * Mixin providing a scala-logging `Logger` that wraps the SLF4J logger named "common".
 */
trait ProjectLogger {
  val logger = {
    val underlying = LoggerFactory.getLogger("common")
    Logger(underlying)
  }
}
20 changes: 20 additions & 0 deletions raw_data_processor/src/test/resources/test_msd.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# TRAINING SET
# MusiXmatch dataset, the official lyrics dataset
# of the Million Song Dataset
# file created on Tue Mar 29 04:28:44 2011
# contact: T. Bertin-Mahieux (Columbia University)
# [email protected]
# also: http://labrosa.ee.columbia.edu/millionsong/musixmatch
# http://www.musixmatch.com
# FORMAT:
# # - comment, to ignore
# % - list of top words, comma-separated
# - normal line, contains track_id, mxm track id,
# then word count for each of the top words, comma-separated
# word count is in sparse format -> ...,<word idx>:<cnt>,...
# <word idx> starts at 1 (not zero!)
# All our work is done using UTF-8 encoding.
# enjoy!
%i,the,you,to,and
TRZZZYV128F92E996D,6849828,1:10,2:6,3:20,5:2,7:30
TRZZZYX128F92D32C6,681124,1:4,2:18,4:3,5:6,6:9
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import org.scalatest.{FlatSpec, Matchers}

import scalalab3.lyricsengine.reader.{MillionSongDataSubSet, MillionSongDataSubSetReader, Song}

/**
 * Checks that [[MillionSongDataSubSetReader]] parses the `/test_msd.txt` fixture
 * into the expected words and songs.
 */
class MillionSongDataSubSetReaderTest extends FlatSpec with Matchers {

  "A MillionSongDataSubSetReader" should "load all data correctly" in {
    // --- Prepare expected data:
    val expectedData: MillionSongDataSubSet = prepareTestData()

    // --- Compute actual data:
    // Go through the URI so that percent-escaped characters (e.g. spaces) in the
    // resource location are decoded into a usable file-system path.
    val path = new java.io.File(getClass.getResource("/test_msd.txt").toURI).getPath
    val actualData: MillionSongDataSubSet = MillionSongDataSubSetReader.extract(path)

    // --- Check data:
    actualData.words shouldBe expectedData.words
    actualData.songs.size shouldBe expectedData.songs.size
    // Element-wise checks give a more focused failure message than the full comparison.
    actualData.songs(0) shouldBe expectedData.songs(0)
    actualData.songs(1) shouldBe expectedData.songs(1)
    // Just in case: compare the complete sequences as well.
    actualData.songs shouldBe expectedData.songs
  }

  /** Builds the data set that mirrors the contents of the `/test_msd.txt` fixture. */
  def prepareTestData(): MillionSongDataSubSet = {
    val words = List("i", "the", "you", "to", "and")
    val firstSong = Song("TRZZZYV128F92E996D", "6849828", Map(1 -> 10, 2 -> 6, 3 -> 20, 5 -> 2, 7 -> 30))
    val secondSong = Song("TRZZZYX128F92D32C6", "681124", Map(1 -> 4, 2 -> 18, 4 -> 3, 5 -> 6, 6 -> 9))
    MillionSongDataSubSet(words, Seq(firstSong, secondSong))
  }
}