From 9bcbdf629e81aae356fb3cbb2fa15111cc0a99a4 Mon Sep 17 00:00:00 2001 From: Ben Boyter Date: Wed, 9 Jan 2019 08:40:51 +1100 Subject: [PATCH 1/7] Play around with lazy loading language features --- processor/file.go | 15 +++-- processor/processor.go | 144 +++++++++++++++++++++++++---------------- processor/workers.go | 2 + 3 files changed, 98 insertions(+), 63 deletions(-) diff --git a/processor/file.go b/processor/file.go index 28305932c..4870bec10 100644 --- a/processor/file.go +++ b/processor/file.go @@ -113,13 +113,13 @@ func walkDirectoryParallel(root string, output chan *FileJob) { go func(toWalk string) { filejobs := walkDirectory(toWalk, PathBlacklist, extensionLookup) for i := 0; i < len(filejobs); i++ { + mutex.Lock() + LoadLanguageFeature(filejobs[i].Language) + totalCount += len(filejobs) + mutex.Unlock() output <- &filejobs[i] } - mutex.Lock() - totalCount += len(filejobs) - mutex.Unlock() - // Turn GC back to what it was before if we have parsed enough files if !resetGc && totalCount >= GcFileCount { debug.SetGCPercent(gcPercent) @@ -128,7 +128,7 @@ func walkDirectoryParallel(root string, output chan *FileJob) { wg.Done() }(filepath.Join(root, f.Name())) } - } else { + } else { // File processing starts here if gitignoreerror != nil || !gitignore.Match(filepath.Join(root, f.Name()), false) { shouldSkip := false @@ -158,10 +158,13 @@ func walkDirectoryParallel(root string, output chan *FileJob) { } if ok { - output <- &FileJob{Location: filepath.Join(root, f.Name()), Filename: f.Name(), Extension: extension, Language: language} mutex.Lock() + // If we have the extension then load in the features for it + LoadLanguageFeature(language) totalCount++ mutex.Unlock() + + output <- &FileJob{Location: filepath.Join(root, f.Name()), Filename: f.Name(), Extension: extension, Language: language} } else if Verbose { printWarn(fmt.Sprintf("skipping file unknown extension: %s", f.Name())) } diff --git a/processor/processor.go b/processor/processor.go index aa0c1511f..69f5b2db6 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -9,6 +9,7 @@ import ( "runtime/debug" "sort" "strings" + "sync" ) // Flags set via the CLI which control how the output is displayed @@ -42,10 +43,15 @@ var gcPercent = -1 // Not set via flags but by arguments following the the flags var DirFilePaths = []string{} +// Raw database loaded +var database = map[string]Language{} + // Loaded from the JSON that is in constants.go var ExtensionToLanguage = map[string]string{} var LanguageFeatures = map[string]LanguageFeature{} +var LanguageFeaturesLock sync.Mutex + // This needs to be set outside of ProcessConstants because it should only be enabled in command line // mode https://github.com/boyter/scc/issues/32 func ConfigureGc() { @@ -55,7 +61,7 @@ func ConfigureGc() { // ProcessConstants is responsible for setting up the language features based on the JSON file that is stored in constants // Needs to be called at least once in order for anything to actually happen func ProcessConstants() { - var database = loadDatabase() + database = loadDatabase() startTime := makeTimestampNano() for name, value := range database { @@ -68,69 +74,93 @@ func ProcessConstants() { printTrace(fmt.Sprintf("nanoseconds build extension to language: %d", makeTimestampNano()-startTime)) } - startTime = makeTimestampMilli() - for name, value := range database { - complexityTrie := &Trie{} - slCommentTrie := &Trie{} - mlCommentTrie := &Trie{} - stringTrie := &Trie{} - tokenTrie := &Trie{} - - complexityMask := byte(0) - singleLineCommentMask := byte(0) - multiLineCommentMask := byte(0) - stringMask := byte(0) - processMask := byte(0) - - for _, v := range value.ComplexityChecks { - complexityMask |= v[0] - complexityTrie.Insert(T_COMPLEXITY, []byte(v)) - if !Complexity { - tokenTrie.Insert(T_COMPLEXITY, []byte(v)) - } - } - if !Complexity { - processMask |= complexityMask - } + //startTime = makeTimestampMilli() + //for name, value := range database { + // processLanguageFeature(name, value) + //} + // + //if Trace { + // printTrace(fmt.Sprintf("milliseconds build language features: %d", makeTimestampMilli()-startTime)) + //} +} - for _, v := range value.LineComment { - singleLineCommentMask |= v[0] - slCommentTrie.Insert(T_SLCOMMENT, []byte(v)) - tokenTrie.Insert(T_SLCOMMENT, []byte(v)) - } - processMask |= singleLineCommentMask +// Will load a single feature given the name +func LoadLanguageFeature(loadName string) { + // Check if already loaded and if so return because we don't need to do it again + _, ok := LanguageFeatures[loadName] + if ok { + return + } - for _, v := range value.MultiLine { - multiLineCommentMask |= v[0][0] - mlCommentTrie.InsertClose(T_MLCOMMENT, []byte(v[0]), []byte(v[1])) - tokenTrie.InsertClose(T_MLCOMMENT, []byte(v[0]), []byte(v[1])) - } - processMask |= multiLineCommentMask + var name string + var value Language - for _, v := range value.Quotes { - stringMask |= v[0][0] - stringTrie.InsertClose(T_STRING, []byte(v[0]), []byte(v[1])) - tokenTrie.InsertClose(T_STRING, []byte(v[0]), []byte(v[1])) + for name, value = range database { + if name == loadName { + break } - processMask |= stringMask - - LanguageFeatures[name] = LanguageFeature{ - Complexity: complexityTrie, - MultiLineComments: mlCommentTrie, - SingleLineComments: slCommentTrie, - Strings: stringTrie, - Tokens: tokenTrie, - Nested: value.NestedMultiLine, - ComplexityCheckMask: complexityMask, - MultiLineCommentMask: multiLineCommentMask, - SingleLineCommentMask: singleLineCommentMask, - StringCheckMask: stringMask, - ProcessMask: processMask, + } + + processLanguageFeature(loadName, value) +} + +func processLanguageFeature(name string, value Language) { + complexityTrie := &Trie{} + slCommentTrie := &Trie{} + mlCommentTrie := &Trie{} + stringTrie := &Trie{} + tokenTrie := &Trie{} + + complexityMask := byte(0) + singleLineCommentMask := byte(0) + multiLineCommentMask := byte(0) + stringMask := byte(0) + processMask := byte(0) + + for _, v := range value.ComplexityChecks { + complexityMask |= v[0] + complexityTrie.Insert(T_COMPLEXITY, []byte(v)) + if !Complexity { + tokenTrie.Insert(T_COMPLEXITY, []byte(v)) } } + if !Complexity { + processMask |= complexityMask + } - if Trace { - printTrace(fmt.Sprintf("milliseconds build language features: %d", makeTimestampMilli()-startTime)) + for _, v := range value.LineComment { + singleLineCommentMask |= v[0] + slCommentTrie.Insert(T_SLCOMMENT, []byte(v)) + tokenTrie.Insert(T_SLCOMMENT, []byte(v)) + } + processMask |= singleLineCommentMask + + for _, v := range value.MultiLine { + multiLineCommentMask |= v[0][0] + mlCommentTrie.InsertClose(T_MLCOMMENT, []byte(v[0]), []byte(v[1])) + tokenTrie.InsertClose(T_MLCOMMENT, []byte(v[0]), []byte(v[1])) + } + processMask |= multiLineCommentMask + + for _, v := range value.Quotes { + stringMask |= v[0][0] + stringTrie.InsertClose(T_STRING, []byte(v[0]), []byte(v[1])) + tokenTrie.InsertClose(T_STRING, []byte(v[0]), []byte(v[1])) + } + processMask |= stringMask + + LanguageFeatures[name] = LanguageFeature{ + Complexity: complexityTrie, + MultiLineComments: mlCommentTrie, + SingleLineComments: slCommentTrie, + Strings: stringTrie, + Tokens: tokenTrie, + Nested: value.NestedMultiLine, + ComplexityCheckMask: complexityMask, + MultiLineCommentMask: multiLineCommentMask, + SingleLineCommentMask: singleLineCommentMask, + StringCheckMask: stringMask, + ProcessMask: processMask, } } diff --git a/processor/workers.go b/processor/workers.go index 20fcf20cb..d194e7709 100644 --- a/processor/workers.go +++ b/processor/workers.go @@ -337,6 +337,8 @@ func CountStats(fileJob *FileJob) { } } + // Only check the first 10000 characters for null bytes indicating a binary file + // and if we find it then we return otherwise carry on and ignore binary markers if index < 10000 && fileJob.Binary { return } From f0e15cbb250bd822026e4eb757f6b40889fb404f Mon Sep 17 00:00:00 2001 From: Ben Boyter Date: Wed, 9 Jan 2019 08:54:38 +1100 Subject: [PATCH 2/7] Almost there just need to fix some race issues --- main.go | 1 + processor/processor.go | 45 ++++++++++++++++++++++++++---------------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/main.go b/main.go index 5518638a6..bc8872886 100644 --- a/main.go +++ b/main.go @@ -21,6 +21,7 @@ func main() { Run: func(cmd *cobra.Command, args []string) { processor.DirFilePaths = args processor.ConfigureGc() + processor.ConfigureLazy(true) processor.Process() }, } diff --git a/processor/processor.go b/processor/processor.go index 69f5b2db6..05a7eb0f1 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -9,7 +9,6 @@ import ( "runtime/debug" "sort" "strings" - "sync" ) // Flags set via the CLI which control how the output is displayed @@ -39,32 +38,35 @@ var WhiteListExtensions = []string{} var AverageWage int64 = 56286 var GcFileCount = 10000 var gcPercent = -1 +var isLazy = false // Not set via flags but by arguments following the the flags var DirFilePaths = []string{} -// Raw database loaded -var database = map[string]Language{} +// Raw languageDatabase loaded +var languageDatabase = map[string]Language{} // Loaded from the JSON that is in constants.go var ExtensionToLanguage = map[string]string{} var LanguageFeatures = map[string]LanguageFeature{} -var LanguageFeaturesLock sync.Mutex - // This needs to be set outside of ProcessConstants because it should only be enabled in command line // mode https://github.com/boyter/scc/issues/32 func ConfigureGc() { gcPercent = debug.SetGCPercent(gcPercent) } +func ConfigureLazy(lazy bool) { + isLazy = lazy +} + // ProcessConstants is responsible for setting up the language features based on the JSON file that is stored in constants // Needs to be called at least once in order for anything to actually happen func ProcessConstants() { - database = loadDatabase() + languageDatabase = loadDatabase() startTime := makeTimestampNano() - for name, value := range database { + for name, value := range languageDatabase { for _, ext := range value.Extensions { ExtensionToLanguage[ext] = name } @@ -74,18 +76,27 @@ func ProcessConstants() { printTrace(fmt.Sprintf("nanoseconds build extension to language: %d", makeTimestampNano()-startTime)) } - //startTime = makeTimestampMilli() - //for name, value := range database { - // processLanguageFeature(name, value) - //} - // - //if Trace { - // printTrace(fmt.Sprintf("milliseconds build language features: %d", makeTimestampMilli()-startTime)) - //} + // If lazy is set then we want to load in the features as we find them not in one go + // however otherwise being used as a library so just load them all in + if !isLazy { + startTime = makeTimestampMilli() + for name, value := range languageDatabase { + processLanguageFeature(name, value) + } + + if Trace { + printTrace(fmt.Sprintf("milliseconds build language features: %d", makeTimestampMilli()-startTime)) + } + } } -// Will load a single feature given the name +// Will load a single feature as requested given the name +// this is used with lazy loading func LoadLanguageFeature(loadName string) { + if !isLazy { + return + } + // Check if already loaded and if so return because we don't need to do it again _, ok := LanguageFeatures[loadName] if ok { @@ -95,7 +106,7 @@ func LoadLanguageFeature(loadName string) { var name string var value Language - for name, value = range database { + for name, value = range languageDatabase { if name == loadName { break } From 56484bf39b2221daeb0ebfd86ac0ba33f24a1c16 Mon Sep 17 00:00:00 2001 From: Ben Boyter Date: Wed, 9 Jan 2019 08:58:43 +1100 Subject: [PATCH 3/7] Add trace output for lazy language loading --- processor/processor.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/processor/processor.go b/processor/processor.go index 05a7eb0f1..2800773c3 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -87,6 +87,8 @@ func ProcessConstants() { if Trace { printTrace(fmt.Sprintf("milliseconds build language features: %d", makeTimestampMilli()-startTime)) } + } else { + printTrace("configured to lazy load language features") } } @@ -112,7 +114,11 @@ func LoadLanguageFeature(loadName string) { } } + startTime := makeTimestampNano() processLanguageFeature(loadName, value) + if Trace { + printTrace(fmt.Sprintf("nanoseconds to build language %s features: %d", loadName, makeTimestampNano()-startTime)) + } } func processLanguageFeature(name string, value Language) { From 1a79d3a9aaa732657520356379b6bcf20ad4ff52 Mon Sep 17 00:00:00 2001 From: Ben Boyter Date: Wed, 9 Jan 2019 17:29:01 +1100 Subject: [PATCH 4/7] Add lock to resolve any race issues --- processor/processor.go | 6 ++++++ processor/workers.go | 2 ++ 2 files changed, 8 insertions(+) diff --git a/processor/processor.go b/processor/processor.go index 2800773c3..12d7af97b 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -9,6 +9,7 @@ import ( "runtime/debug" "sort" "strings" + "sync" ) // Flags set via the CLI which control how the output is displayed @@ -49,6 +50,7 @@ var languageDatabase = map[string]Language{} // Loaded from the JSON that is in constants.go var ExtensionToLanguage = map[string]string{} var LanguageFeatures = map[string]LanguageFeature{} +var LanguageFeaturesMutex = sync.Mutex{} // This needs to be set outside of ProcessConstants because it should only be enabled in command line // mode https://github.com/boyter/scc/issues/32 @@ -100,7 +102,9 @@ func LoadLanguageFeature(loadName string) { } // Check if already loaded and if so return because we don't need to do it again + LanguageFeaturesMutex.Lock() _, ok := LanguageFeatures[loadName] + LanguageFeaturesMutex.Unlock() if ok { return } @@ -166,6 +170,7 @@ func processLanguageFeature(name string, value Language) { } processMask |= stringMask + LanguageFeaturesMutex.Lock() LanguageFeatures[name] = LanguageFeature{ Complexity: complexityTrie, MultiLineComments: mlCommentTrie, @@ -179,6 +184,7 @@ func processLanguageFeature(name string, value Language) { StringCheckMask: stringMask, ProcessMask: processMask, } + LanguageFeaturesMutex.Unlock() } func processFlags() { diff --git a/processor/workers.go b/processor/workers.go index d194e7709..3d5fe5586 100644 --- a/processor/workers.go +++ b/processor/workers.go @@ -257,7 +257,9 @@ func CountStats(fileJob *FileJob) { return } + LanguageFeaturesMutex.Lock() langFeatures := LanguageFeatures[fileJob.Language] + LanguageFeaturesMutex.Unlock() if langFeatures.Complexity == nil { langFeatures.Complexity = &Trie{} From 4de9cdb3ad0dfbcb0e53e207c704c6727ecebbf3 Mon Sep 17 00:00:00 2001 From: Ben Boyter Date: Wed, 9 Jan 2019 17:30:17 +1100 Subject: [PATCH 5/7] Update version --- main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.go b/main.go index bc8872886..bc6939627 100644 --- a/main.go +++ b/main.go @@ -17,7 +17,7 @@ func main() { Use: "scc", Short: "scc DIRECTORY", Long: "Sloc, Cloc and Code. Count lines of code in a directory with complexity estimation.\nBen Boyter + Contributors", - Version: "2.0.0", + Version: "2.1.0", Run: func(cmd *cobra.Command, args []string) { processor.DirFilePaths = args processor.ConfigureGc() From 1a5af928cf9e23d8f7ee30f57c663fd18c568c3c Mon Sep 17 00:00:00 2001 From: Ben Boyter Date: Wed, 9 Jan 2019 17:42:08 +1100 Subject: [PATCH 6/7] Move feature method out of lock --- processor/file.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/processor/file.go b/processor/file.go index 4870bec10..d42bda137 100644 --- a/processor/file.go +++ b/processor/file.go @@ -114,9 +114,10 @@ func walkDirectoryParallel(root string, output chan *FileJob) { filejobs := walkDirectory(toWalk, PathBlacklist, extensionLookup) for i := 0; i < len(filejobs); i++ { mutex.Lock() - LoadLanguageFeature(filejobs[i].Language) totalCount += len(filejobs) mutex.Unlock() + + LoadLanguageFeature(filejobs[i].Language) output <- &filejobs[i] } @@ -159,11 +160,10 @@ func walkDirectoryParallel(root string, output chan *FileJob) { if ok { mutex.Lock() - // If we have the extension then load in the features for it - LoadLanguageFeature(language) totalCount++ mutex.Unlock() + LoadLanguageFeature(language) output <- &FileJob{Location: filepath.Join(root, f.Name()), Filename: f.Name(), Extension: extension, Language: language} } else if Verbose { printWarn(fmt.Sprintf("skipping file unknown extension: %s", f.Name())) From d6e14d7640b54272af28c996dbc25c73fff06754 Mon Sep 17 00:00:00 2001 From: Ben Boyter Date: Wed, 9 Jan 2019 18:00:41 +1100 Subject: [PATCH 7/7] Move count increment outside of loop --- processor/file.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/processor/file.go b/processor/file.go index d42bda137..7d414af70 100644 --- a/processor/file.go +++ b/processor/file.go @@ -113,14 +113,14 @@ func walkDirectoryParallel(root string, output chan *FileJob) { go func(toWalk string) { filejobs := walkDirectory(toWalk, PathBlacklist, extensionLookup) for i := 0; i < len(filejobs); i++ { - mutex.Lock() - totalCount += len(filejobs) - mutex.Unlock() - LoadLanguageFeature(filejobs[i].Language) output <- &filejobs[i] } + mutex.Lock() + totalCount += len(filejobs) + mutex.Unlock() + // Turn GC back to what it was before if we have parsed enough files if !resetGc && totalCount >= GcFileCount { debug.SetGCPercent(gcPercent)