-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Create project structure #2
base: master
Are you sure you want to change the base?
Changes from 4 commits
362f194
1a339fc
6642b84
fa9bd4c
d3b1a7d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,3 +15,6 @@ project/plugins/project/ | |
# Scala-IDE specific | ||
.scala_dependencies | ||
.worksheet | ||
|
||
# IntelliJ IDEA | ||
/.idea |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
name := "lyrics-engine" | ||
|
||
version := "1.0" | ||
|
||
scalaVersion := "2.11.8" | ||
|
||
|
||
lazy val raw_data_processor = project in file("raw_data_processor") | ||
lazy val data_analyzer = project in file("data_analyzer") | ||
lazy val http_api = project in file("http_api") | ||
|
||
// TODO Setup visibility common dependencies. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
name := "data_analyzer" | ||
|
||
version := "1.0" | ||
|
||
scalaVersion := "2.11.8" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
name := "http_api" | ||
|
||
version := "1.0" | ||
|
||
scalaVersion := "2.11.8" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
name := "raw_data_processor" | ||
|
||
version := "1.0" | ||
|
||
scalaVersion := "2.11.8" | ||
|
||
|
||
// Typesafe Config is published without a Scala-version suffix, hence the single %.
libraryDependencies += "com.typesafe" % "config" % "1.3.0"
// ScalaTest is only used by the test sources: %% picks the artifact matching scalaVersion
// (scalatest_2.11 for 2.11.8) and the "test" scope keeps it off the main classpath.
libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.6" % "test"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
sbt.version = 0.13.8 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
logLevel := Level.Warn |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
path_to_msd = "C:\\Temp\\mxm_dataset_train.txt" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package scalalab3.lyricsengine.processor | ||
|
||
/** | ||
* Performs operations on songs. | ||
*/ | ||
object SongProcessor { | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package scalalab3.lyricsengine.processor | ||
|
||
/** | ||
* Performs operations on words. | ||
*/ | ||
object WordsProcessor { | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
package scalalab3.lyricsengine.reader | ||
|
||
|
||
/** | ||
* This class represents data from the Million Song Dataset (MSD). | ||
*/ | ||
case class MSD(words: Seq[String], songs: Seq[Song]) | ||
|
||
case class Song(trackId: String, mxmTrackId: String, words: Map[Int, Int]) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package scalalab3.lyricsengine.reader | ||
|
||
import com.typesafe.config.ConfigFactory | ||
|
||
import scala.io.Source | ||
|
||
object Reader { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd suggest type names to be more informative |
||
val conf = ConfigFactory.load(); | ||
|
||
/** Entry point: runs the extraction and prints the resulting MSD to standard output. */
def main(args: Array[String]): Unit = {
  val dataset = extract()
  print(dataset)
}
|
||
// Looks up the dataset location under the "path_to_msd" configuration key, loads that file
// through read, and assembles the parsed word list and songs into an MSD value.
def extract(): MSD = { | ||
val path = conf.getString("path_to_msd") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd suggest to separate config part from execution part. It will give a bit more flexibility. |
||
|
||
val pair = read(path) | ||
|
||
// pair._1 is the raw word-list line, pair._2 the raw song lines (see read).
val words = extractWords(pair._1) | ||
val songs = extractSongs(pair._2) | ||
|
||
new MSD(words, songs) | ||
} | ||
|
||
// Reads the file at `path` and returns a pair: the line at index 17 and every line from
// index 18 to the end. In the sample dataset shown in this change, the 18th line is the
// '%'-prefixed word list and the lines after it are the per-track records.
// NOTE(review): the Source opened by Source.fromFile is never closed in this method —
// looks like a file-handle leak; confirm and close it once the lines are materialized.
// NOTE(review): a file with fewer than 18 lines makes lines(17) fail — confirm whether
// that input is possible.
private def read(path: String) = { | ||
val lines = Source.fromFile(path).getLines().toSeq | ||
val words = lines(17) | ||
val songs = lines.slice(18, lines.size) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Magic values? Could we make the code a bit more general? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Of course i can find data by pattern but, current approach have nice speed and little bit of code. Less code - more understanding.) |
||
(words, songs) | ||
} | ||
|
||
/**
 * Splits the raw word-list line into individual words.
 *
 * The first character of the line is dropped before the remainder is split on commas.
 */
private def extractWords(words: String) = {
  val withoutMarker = words.substring(1)
  withoutMarker.split(",")
}
|
||
/**
 * Converts raw song lines into Song values.
 *
 * Each line is split on commas: the first field is the track id, the second the mxm track id,
 * and every remaining field is handed to extractSongWords.
 */
private def extractSongs(songsData: Seq[String]) =
  for (line <- songsData) yield {
    val fields = line.split(",")
    val track = fields(0)
    val mxmTrack = fields(1)
    val counts = extractSongWords(fields.slice(2, fields.size))
    new Song(track, mxmTrack, counts)
  }
|
||
/**
 * Parses sparse "index:count" entries into a map from word index to count.
 *
 * @param words entries of the form "<word idx>:<cnt>"
 * @return map keyed by word index; an empty input yields an empty map
 * @throws IllegalArgumentException if an entry does not consist of exactly two ':'-separated parts
 * @throws NumberFormatException if either part of an entry is not an integer
 */
private def extractSongWords(words: Array[String]): Map[Int, Int] =
  words.toSeq.map { entry =>
    val parts = entry.split(":")
    // require (not assert): this validates external input and must not be elidable.
    require(parts.length == 2, s"Malformed word entry '$entry', expected <word idx>:<cnt>")
    parts(0).toInt -> parts(1).toInt
  }.toMap
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# TRAINING SET | ||
# MusiXmatch dataset, the official lyrics dataset | ||
# of the Million Song Dataset | ||
# file created on Tue Mar 29 04:28:44 2011 | ||
# contact: T. Bertin-Mahieux (Columbia University) | ||
# [email protected] | ||
# also: http://labrosa.ee.columbia.edu/millionsong/musixmatch | ||
# http://www.musixmatch.com | ||
# FORMAT: | ||
# # - comment, to ignore | ||
# % - list of top words, comma-separated | ||
# - normal line, contains track_id, mxm track id, | ||
# then word count for each of the top words, comma-separated | ||
# word count is in sparse format -> ...,<word idx>:<cnt>,... | ||
# <word idx> starts at 1 (not zero!) | ||
# All our work is done using UTF-8 encoding. | ||
# enjoy! | ||
%i,the,you,to,and,a,me,it,not,in,my,is,of,your,that,do,on,are,we,am,will,all,for,no,be,have,love,so,know,this,but,with,what,just,when,like,now,que,time,can,come,de,there,go,up,oh,la,one,they,out,down,get,she,was,see,if,got,never,from,he,feel,want,let,make,way,say,take,would,as,ca,day,at,babi,away | ||
TRZZZYV128F92E996D,6849828,1:10,2:6,3:20,5:2,7:30,8:1,9:6,10:3,11:19,12:1,13:16,14:6,15:1,18:4,20:2,21:2,23:1,24:4,26:4,31:3,35:1,37:2,50:15,52:21,56:1,57:3,59:3,63:6,68:2,74:3,75:3,84:1,92:1,94:3,100:1,101:2,111:2,121:3,123:5,142:3,144:1,155:1,162:31,167:6,184:3,203:3,216:1,227:1,240:2,261:1,358:1,368:2,372:4,382:3,403:2,415:2,434:1,467:3,485:1,571:1,592:3,627:4,672:6,1023:1,1123:1,1147:1,1247:1,1262:1,1294:1,1375:2,1391:15,1542:1,1619:1,2386:1,4817:3 | ||
TRZZZYX128F92D32C6,681124,1:4,2:18,4:3,5:6,6:9,7:1,8:5,9:1,10:12,11:2,12:9,13:5,15:4,17:2,18:4,19:4,21:2,22:3,25:3,29:2,30:3,32:5,34:3,35:2,37:2,39:3,43:2,44:7,45:2,49:2,50:2,52:8,54:2,56:1,59:1,63:10,64:1,66:1,71:2,72:1,78:1,79:1,84:2,91:1,92:2,95:1,96:1,103:1,110:1,115:1,127:1,134:4,135:2,136:3,137:1,138:2,140:1,146:2,150:1,152:1,192:1,206:1,207:1,222:3,239:1,248:2,258:1,270:1,274:1,283:1,294:2,300:1,305:1,318:1,337:1,338:1,346:1,347:1,349:1,371:1,398:1,406:1,445:1,451:1,459:1,478:1,487:1,492:1,502:3,516:1,539:1,548:2,549:1,553:1,592:2,617:1,633:1,666:1,681:1,727:1,774:1,775:7,789:1,811:1,844:1,915:1,942:1,950:1,979:1,1008:1,1040:1,1080:9,1142:1,1232:1,1366:1,1409:1,1412:1,1537:1,1545:1,1597:1,1841:2,1861:1,1901:1,1907:1,2063:1,2097:1,2167:1,2198:1,2221:1,2468:1,2498:1,2595:1,2698:1,2975:1,2990:1,2996:1,3256:1,3267:1,3316:2,3355:1,4198:1,4356:1,4738:1,4845:1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import org.scalatest.{FlatSpec, Matchers} | ||
|
||
class ReaderTest extends FlatSpec with Matchers { | ||
|
||
"A MSD" should "contains" in { | ||
// val raw = Source.fromInputStream(getClass.getResourceAsStream("/test_msd.txt")).mkString | ||
// Reader.extract("/test_msd.txt") | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we have some limits on type/variable names? :) Why MSD, and not MillionSongDataset?
And a more general question: what will happen if we realize in a month that this Million Song Dataset does not meet all our needs? And let's imagine we find (or generate) a Billion Songs Dataset (with different metadata, or at least a size that cannot be loaded into RAM directly). So, could you please comment a bit on the idea of this class?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. The idea of this class (MSD) is to represent data in the specific format loaded from: http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/mxm_dataset_train.txt.zip
For other types of data we will write another handler. :)
I agree, we need to use more informative naming.