SearchEngine - My first scala project

Small file search engine using inverted index, powered by Slick &Akka&Scala-Swing Facing to a task that

Build a basic search tool which allows the user to select a folder. Then for each query (list of terms), the search tool returns all the files under the input folder that contains all these terms. and Manipulating a synonym database from init file

Output of full inverted index map list as below

entry of inverted index map format like word -> Map(doc_id -> (occurrences, index_list)

forecast -> Map(1 -> (1,ArrayBuffer(19)))
sale -> Map(2 -> (6,ArrayBuffer(5, 24, 30, 41, 60, 66)), 4 -> (1,ArrayBuffer(15)), 1 -> (1,ArrayBuffer(9)), 3 -> (1,ArrayBuffer(18)))
rise -> Map(2 -> (2,ArrayBuffer(11, 47)), 4 -> (1,ArrayBuffer(21)))
top -> Map(1 -> (1,ArrayBuffer(15)))
in -> Map(3 -> (2,ArrayBuffer(1, 10)), 2 -> (2,ArrayBuffer(16, 52)))
juli -> Map(2 -> (2,ArrayBuffer(19, 55)), 4 -> (1,ArrayBuffer(1)), 3 -> (1,ArrayBuffer(27)))
home -> Map(2 -> (2,ArrayBuffer(0, 36)), 4 -> (1,ArrayBuffer(10)), 1 -> (1,ArrayBuffer(4)), 3 -> (1,ArrayBuffer(13)))
new -> Map(4 -> (1,ArrayBuffer(6)), 1 -> (1,ArrayBuffer(0)))
increas -> Map(3 -> (1,ArrayBuffer(1)))
ascend -> Map(1 -> (3,ArrayBuffer(29, 36, 43)))

Core code to build full inverted index as below

object InvertedIndexHelper extends App {

  var filePath2ID = Map[String,String]()
  var ID2FilePath = Map[String,String]()
  
  type word_occ_indexes = (String, Int, ArrayBuffer[Int])
  type invertedIndexMap = Map[String, Map[String, (Int, ArrayBuffer[Int])]]

  def buildupInvertedMap(files: Seq[String], f: (word_occ_indexes) => (word_occ_indexes)):invertedIndexMap = {
    // Map of (filePath -> one line of content)
    var fileID = 0
    val fileToContentMap = files.map { filePath =>
      val sourceFile = Source.fromFile(filePath)
      //reformat file content to one line, which makes easier to count word indexes
      val oneLine = sourceFile.getLines().mkString(" ") 
      sourceFile.close()
      //generate filePath -> id
      fileID+=1
      filePath2ID += (filePath -> fileID.toString)
      (fileID.toString -> oneLine)
    }
    //invert the map
    ID2FilePath = filePath2ID.map(_.swap)

    val unsortedResult = fileToContentMap.map {
      //convert to Iterable of (file -> (word,frequency, array of index))
      case (file, content) => (file -> lineToTuple(content).map(f(_)))
    }.foldLeft(Map.empty[String, Map[String, (Int, ArrayBuffer[Int])]]) {
      (map, file_content) => {
        file_content._2.foreach { word_freq_indexes =>
          //map add an entry which is (word -> entry of (filePath -> (frequency, Array of index)))
          val indexMapValue = (map.getOrElse(word_freq_indexes._1, Map()) += (file_content._1 -> (word_freq_indexes._2, word_freq_indexes._3)))
          map += (word_freq_indexes._1 -> indexMapValue)
        }
        map
      }
    }

    unsortedResult.map {case (word, listMap) =>
        //sort by word frequency
        val newListMap = ListMap(listMap.toSeq.sortBy(_._2._1): _*)
        (word -> newListMap)
      }

  }


  private def lineToTuple(line: String): Iterable[(String, Int, ArrayBuffer[Int])] = {
    val ss = Map.empty[String, (String, Int, ArrayBuffer[Int])]
    line.trim.split("\\W+").foreach { word =>
      ss(word) = multiIndexOf(line, word, ss.getOrElse(word, (word, 0, ArrayBuffer[Int]())))
    }
    ss.values
  }

  private def multiIndexOf(line: String, word: String, acc: (String, Int, ArrayBuffer[Int])) = {
    if (acc._1 == word) {
      val nextIndex = if (acc._3.isEmpty) line.indexOf(word) else line.indexOf(word, acc._3.last + word.size)
      val freq = acc._2 + 1
      acc._3.append(nextIndex)
      (word, freq, acc._3)
    } else {
      acc
    }
  }


  val prefix = "fileAbsolutePath"
  val input = List(prefix + "doc1.txt", prefix + "doc2.txt", prefix + "doc3.txt", prefix + "doc4.txt")
  println(buildupInvertedMap(input, Stemming.doStem _).mkString("\n"))

}

Unit test covered by

scalaTest & akka-testkit


class SearchActorSpec extends TestKit(ActorSystem(
  "SearchActorSpec",
  ConfigFactory.parseString(SearchActorSpec.config)))
  with DefaultTimeout with ImplicitSender
  with WordSpecLike with Matchers with BeforeAndAfterAll{

  val searchActor = system.actorOf(Props[SearchActor],name="searchActor")
  val searchFile = new File(getClass.getResource("/doc4.txt").getFile)

  val searchActorRef = TestActorRef(new SearchActor)

  override def afterAll(): Unit = shutdown()

  "SearchActor" should {
    "respond to the SearchMessage " in {
      within(500 millis) {
        searchActor ! SearchMessage(searchFile.getParentFile,"forecast")
        expectMsgPF(500 millis){
          case msg @ (a:List[String],b:String) =>
            a should have size 1
            b should not be empty
        }
      }
    }

    "throw exception" in {
      intercept[Exception](searchActorRef.receive("error"))
    }
  }
}


object SearchActorSpec {
  val config =
    """
      |akka {
      |   loglevel = "WARNING"
      |}
    """.stripMargin
}

Stemming the words by

The Porter Stemming Algorithm

Reference to inverted index

elastic - basic concepts URL A first take at building an inverted index Full inverted index

Name		Name	Last commit message	Last commit date
Latest commit History 44 Commits
project		project
src		src
.gitignore		.gitignore
README.md		README.md
build.sbt		build.sbt

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

SearchEngine - My first scala project

Output of full inverted index map list as below

Core code to build full inverted index as below

Unit test covered by

Stemming the words by

Reference to inverted index

About

Releases

Packages

Languages

flying3615/SearchEngine

Folders and files

Latest commit

History

Repository files navigation

SearchEngine - My first scala project

Output of full inverted index map list as below

Core code to build full inverted index as below

Unit test covered by

Stemming the words by

Reference to inverted index

About

Topics

Resources

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages