Skip to content
This repository has been archived by the owner on Aug 20, 2024. It is now read-only.

Commit

Permalink
Merge pull request #72 from nvnieuwk/duplicate-checking
Browse files Browse the repository at this point in the history
Duplicate checking
  • Loading branch information
nvnieuwk authored Jul 10, 2023
2 parents b4c5810 + cf258c8 commit a288a90
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 7 deletions.
6 changes: 4 additions & 2 deletions docs/samplesheets/fromSamplesheet.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ This parameter be described in the Nextflow parameter schema using as a file, wi

The given sample sheet schema specified in the `schema` key is then loaded and used for validation and sample sheet generation.

An additional function optional argument can be used:
Some additional optional function arguments can be used:

- File name for the pipeline parameters schema. (Default: `nextflow_schema.json`)
- `schema_filename`: File name for the pipeline parameters schema. (Default: `nextflow_schema.json`)
- `skip_duplicate_check`: Skip the check for duplicate entries. The check can also be skipped with the `--validationSkipDuplicateCheck` parameter. (Default: `false`)

```groovy
Channel.fromSamplesheet('input')
Expand All @@ -35,6 +36,7 @@ Channel.fromSamplesheet('input')
Channel.fromSamplesheet(
'input',
schema_filename: 'custom_nextflow_schema.json',
skip_duplicate_check: false
)
```

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class SamplesheetConverter {
private static List<String> schemaErrors = []
private static List<String> warnings = []

private static List<Map> rows = []

static boolean hasErrors() { errors.size()>0 }
static Set<String> getErrors() { errors.sort().collect { "\t${it}".toString() } as Set }

Expand All @@ -54,7 +56,8 @@ class SamplesheetConverter {

static List convertToList(
Path samplesheetFile,
Path schemaFile
Path schemaFile,
Boolean skipDuplicateCheck
) {

def Map schemaMap = (Map) new JsonSlurper().parseText(schemaFile.text)
Expand All @@ -78,14 +81,15 @@ class SamplesheetConverter {
def Map<String,List<String>> booleanUniques = [:]
def Map<String,List<Map<String,String>>> listUniques = [:]
def Boolean headerCheck = true
this.rows = []
resetCount()

def List outputs = samplesheetList.collect { Map<String,String> fullRow ->
increaseCount()

Map<String,String> row = fullRow.findAll { it.value != "" }
def Set rowKeys = row.keySet()
def String yamlInfo = fileType == "yaml" ? " for sample ${this.getCount()}." : ""
def String yamlInfo = fileType == "yaml" ? " for entry ${this.getCount()}." : ""

// Check the header (CSV/TSV) or present fields (YAML)
if(headerCheck) {
Expand All @@ -99,6 +103,13 @@ class SamplesheetConverter {
}
}

// Check for row uniqueness
if(!skipDuplicateCheck && this.rows.contains(row)) {
def Integer firstDuplicate = this.rows.findIndexOf { it == row }
this.errors << "The samplesheet contains duplicate rows for entry ${firstDuplicate + 1} and entry ${getCount()} (${row})".toString()
}
this.rows.add(row)

def Map meta = [:]
def ArrayList output = []

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ class SchemaValidator extends PluginExtensionPoint {

// Set defaults for optional inputs
def String schemaFilename = options?.containsKey('schema_filename') ? options.schema_filename as String : 'nextflow_schema.json'
def Boolean skipDuplicateCheck = options?.containsKey('skip_duplicate_check') ? options.skip_duplicate_check as Boolean : params.validationSkipDuplicateCheck ? params.validationSkipDuplicateCheck as Boolean : false

def slurper = new JsonSlurper()
def Map parsed = (Map) slurper.parse( Path.of(getSchemaPath(baseDir, schemaFilename)) )
Expand Down Expand Up @@ -194,7 +195,7 @@ class SchemaValidator extends PluginExtensionPoint {

// Convert to channel
final channel = CH.create()
List arrayChannel = SamplesheetConverter.convertToList(samplesheetFile, schemaFile)
List arrayChannel = SamplesheetConverter.convertToList(samplesheetFile, schemaFile, skipDuplicateCheck)
session.addIgniter {
arrayChannel.each {
channel.bind(it)
Expand Down Expand Up @@ -227,6 +228,9 @@ class SchemaValidator extends PluginExtensionPoint {
if( !params.containsKey("validationSchemaIgnoreParams") ) {
params.validationSchemaIgnoreParams = false
}
if( !params.containsKey("validationSkipDuplicateCheck") ) {
params.validationSkipDuplicateCheck = false
}

return params
}
Expand All @@ -236,7 +240,15 @@ class SchemaValidator extends PluginExtensionPoint {
// Add expected params
//
List addExpectedParams() {
def List expectedParams = ["validationFailUnrecognisedParams", "validationLenientMode", "monochrome_logs", "help", "validationShowHiddenParams", "validationSchemaIgnoreParams"]
def List expectedParams = [
"validationFailUnrecognisedParams",
"validationLenientMode",
"monochrome_logs",
"help",
"validationShowHiddenParams",
"validationSchemaIgnoreParams",
"validationSkipDuplicateCheck"
]

return expectedParams
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ class SamplesheetConverterTest extends Dsl2Spec{
!stdout
}

def 'errors before channel conversion' () {
def 'errors before channel conversion' () {
given:
def SCRIPT_TEXT = '''
include { fromSamplesheet } from 'plugin/nf-validation'
Expand Down Expand Up @@ -250,4 +250,31 @@ class SamplesheetConverterTest extends Dsl2Spec{
errorMessages[12] == "* -- Entry 3 - field_2: expected type: Integer, found: String (false)"
!stdout
}

// Verifies that reading a samplesheet containing identical rows fails with a
// SchemaValidationException that reports the duplicate entries.
def 'duplicates' () {
given:
// Pipeline script that loads duplicate.csv through fromSamplesheet.
// skip_duplicate_check is not passed, so the duplicate check is not explicitly disabled here.
def SCRIPT_TEXT = '''
include { fromSamplesheet } from 'plugin/nf-validation'
params.input = 'src/testResources/duplicate.csv'
workflow {
Channel.fromSamplesheet("input", schema_filename:"src/testResources/nextflow_schema_with_samplesheet_converter.json").view()
}
'''

when:
dsl_eval(SCRIPT_TEXT)
// Keep only the captured output lines that start with '[['
// (presumably the channel items printed by view() -- TODO confirm).
def stdout = capture
.toString()
.readLines()
.findResults {it.startsWith('[[') ? it : null }

then:
// The script must abort with a validation exception; its message is checked line by line.
def error = thrown(SchemaValidationException)
def errorMessages = error.message.readLines()
errorMessages[0] == "Samplesheet errors:"
// The duplicate-row error is expected on the fifth line of the message and names both entries.
errorMessages[4] == "\tThe samplesheet contains duplicate rows for entry 2 and entry 3 ([field_4:string1, field_5:25, field_6:false])"
// No line matching the '[[' filter may have been printed.
!stdout
}
}
4 changes: 4 additions & 0 deletions plugins/nf-validation/src/testResources/duplicate.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,field_9,field_10,field_11
fullField,10,true,string1,25,false,src/testResources/test.txt,src/testResources/testDir,src/testResources/test.txt,unique1,1
,,,string1,25,false,,,,,,
,,,string1,25,false,,,,,,

0 comments on commit a288a90

Please sign in to comment.