diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index 7b591a2..030cd10 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -13,11 +13,11 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: 3.11 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml index 9dc019a..3766e8c 100644 --- a/.github/workflows/pypi-test.yml +++ b/.github/workflows/pypi-test.yml @@ -15,13 +15,13 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ] + python-version: [ '3.9', '3.10', '3.11', '3.12' ] name: Python ${{ matrix.python-version }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: 'pip' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eed031a..e60a5f4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,18 +18,18 @@ repos: args: ['--fix=auto'] # replace 'auto' with 'lf' to enforce Linux/Mac line endings or 'crlf' for Windows # - repo: https://github.com/PyCQA/docformatter -# rev: v1.7.5 +# rev: master # hooks: # - id: docformatter # additional_dependencies: [tomli] # args: [--in-place, --wrap-descriptions=120, --wrap-summaries=120] # # --config, ./pyproject.toml -- repo: https://github.com/psf/black - rev: 24.8.0 - hooks: - - id: black - language_version: python3 +# - repo: https://github.com/psf/black +# rev: 24.8.0 +# hooks: +# - id: black +# language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
@@ -37,6 +37,7 @@ repos: hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format ## If like to embrace black styles even in the docs: # - repo: https://github.com/asottile/blacken-docs diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f259e9..f98a970 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.5.0 + +- chore: Remove Python 3.8 (EOL) +- precommit: Replace black with ruff's formatter (docformatter remains disabled) + ## Version 0.4.32 - 0.4.33 - Bump IRanges package version to fix coercion issues to pandas. diff --git a/pyproject.toml b/pyproject.toml index a7cea75..00aa968 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,10 @@ extend-ignore = ["F821"] [tool.ruff.pydocstyle] convention = "google" +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 20 + [tool.ruff.per-file-ignores] "__init__.py" = ["E402", "F401"] diff --git a/setup.cfg b/setup.cfg index 64f5187..2d3326e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ package_dir = =src # Require a min/specific Python version (comma-separated conditions) -python_requires = >=3.8 +python_requires = >=3.9 # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in diff --git a/src/genomicranges/GenomicRanges.py b/src/genomicranges/GenomicRanges.py index b770afe..7d0101d 100644 --- a/src/genomicranges/GenomicRanges.py +++ b/src/genomicranges/GenomicRanges.py @@ -47,9 +47,7 @@ def _validate_seqnames(seqnames, seqinfo, num_ranges): _l = len(seqinfo) if (seqnames > _l).any(): - raise ValueError( - "'seqnames' contains sequence name not represented in 'seqinfo'." 
- ) + raise ValueError("'seqnames' contains sequence name not represented in 'seqinfo'.") def _validate_ranges(ranges, num_ranges): @@ -107,11 +105,7 @@ def __iter__(self): def __next__(self): if self._current_index < len(self._gr): - iter_row_index = ( - self._gr.names[self._current_index] - if self._gr.names is not None - else None - ) + iter_row_index = self._gr.names[self._current_index] if self._gr.names is not None else None iter_slice = self._gr[self._current_index] self._current_index += 1 @@ -203,14 +197,10 @@ def __init__( _num_ranges = _guess_num_ranges(self._seqnames, self._ranges) _validate_ranges(self._ranges, _num_ranges) _validate_seqnames(self._seqnames, self._seqinfo, _num_ranges) - _validate_optional_attrs( - self._strand, self._mcols, self._names, _num_ranges - ) + _validate_optional_attrs(self._strand, self._mcols, self._names, _num_ranges) def _build_reverse_seqindex(self, seqinfo: SeqInfo): - self._reverse_seqindex = ut.reverse_index.build_reverse_index( - seqinfo.get_seqnames() - ) + self._reverse_seqindex = ut.reverse_index.build_reverse_index(seqinfo.get_seqnames()) def _remove_reverse_seqindex(self): del self._reverse_seqindex @@ -400,9 +390,7 @@ def __str__(self) -> str: data = self._mcols.column(col) showed = ut.show_as_cell(data, indices) header = [col, "<" + ut.print_type(data) + ">"] - showed = ut.truncate_strings( - showed, width=max(40, len(header[0]), len(header[1])) - ) + showed = ut.truncate_strings(showed, width=max(40, len(header[0]), len(header[1]))) if insert_ellipsis: showed = showed[:3] + ["..."] + showed[3:] columns.append(header + showed) @@ -469,9 +457,7 @@ def get_seqnames( else: raise ValueError("Argument 'as_type' must be 'factor' or 'list'.") - def set_seqnames( - self, seqnames: Union[Sequence[str], np.ndarray], in_place: bool = False - ) -> "GenomicRanges": + def set_seqnames(self, seqnames: Union[Sequence[str], np.ndarray], in_place: bool = False) -> "GenomicRanges": """Set new sequence names. 
Args: @@ -490,9 +476,7 @@ def set_seqnames( _validate_seqnames(seqnames, len(self)) if not isinstance(seqnames, np.ndarray): - seqnames = np.asarray( - [self._seqinfo.get_seqnames().index(x) for x in list(seqnames)] - ) + seqnames = np.asarray([self._seqinfo.get_seqnames().index(x) for x in list(seqnames)]) output = self._define_output(in_place) output._seqnames = seqnames @@ -873,9 +857,7 @@ def set_metadata(self, metadata: dict, in_place: bool = False) -> "GenomicRanges or as a reference to the (in-place-modified) original. """ if not isinstance(metadata, dict): - raise TypeError( - f"`metadata` must be a dictionary, provided {type(metadata)}." - ) + raise TypeError(f"`metadata` must be a dictionary, provided {type(metadata)}.") output = self._define_output(in_place) output._metadata = metadata return output @@ -1127,9 +1109,7 @@ def from_pandas(cls, input: "pandas.DataFrame") -> "GenomicRanges": if input.index is not None: names = [str(i) for i in input.index.to_list()] - return cls( - ranges=ranges, seqnames=seqnames, strand=strand, names=names, mcols=mcols - ) + return cls(ranges=ranges, seqnames=seqnames, strand=strand, names=names, mcols=mcols) ################################ ######>> polars interop <<###### @@ -1144,9 +1124,7 @@ def to_polars(self) -> "polars.DataFrame": import polars as pl _rdf = self._ranges.to_polars() - _rdf = _rdf.with_columns( - seqnames=self.get_seqnames(), strand=self.get_strand(as_type="list") - ) + _rdf = _rdf.with_columns(seqnames=self.get_seqnames(), strand=self.get_strand(as_type="list")) if self._names is not None: _rdf = _rdf.with_columns(rownames=self._names) @@ -1210,9 +1188,7 @@ def from_polars(cls, input: "polars.DataFrame") -> "GenomicRanges": names = None - return cls( - ranges=ranges, seqnames=seqnames, strand=strand, names=names, mcols=mcols - ) + return cls(ranges=ranges, seqnames=seqnames, strand=strand, names=names, mcols=mcols) ##################################### ######>> intra-range methods <<###### @@ -1282,9 
+1258,7 @@ def flank( # figure out which position to pin, start or end? start_flags = np.repeat(start, len(all_strands)) if not ignore_strand: - start_flags = [ - start != (all_strands[i] == -1) for i in range(len(all_strands)) - ] + start_flags = [start != (all_strands[i] == -1) for i in range(len(all_strands))] new_starts = [] new_widths = [] @@ -1295,9 +1269,7 @@ def flank( sf = start_flags[idx] tstart = 0 if both is True: - tstart = ( - all_starts[idx] - abs(width) if sf else all_ends[idx] - abs(width) - ) + tstart = all_starts[idx] - abs(width) if sf else all_ends[idx] - abs(width) else: if width >= 0: tstart = all_starts[idx] - abs(width) if sf else all_ends[idx] @@ -1356,9 +1328,7 @@ def resize( output._ranges = self._ranges.resize(width=width, fix=fix) return output - def shift( - self, shift: Union[int, List[int], np.ndarray] = 0, in_place: bool = False - ) -> "GenomicRanges": + def shift(self, shift: Union[int, List[int], np.ndarray] = 0, in_place: bool = False) -> "GenomicRanges": """Shift all intervals. Args: @@ -1382,9 +1352,7 @@ def shift( output._ranges = self._ranges.shift(shift=shift) return output - def promoters( - self, upstream: int = 2000, downstream: int = 200, in_place: bool = False - ) -> "GenomicRanges": + def promoters(self, upstream: int = 2000, downstream: int = 200, in_place: bool = False) -> "GenomicRanges": """Extend intervals to promoter regions. 
Generates promoter ranges relative to the transcription start site (TSS), @@ -1419,21 +1387,13 @@ def promoters( new_starts = np.asarray( [ - ( - all_starts[idx] - upstream - if start_flags[idx] - else all_ends[idx] - downstream - ) + (all_starts[idx] - upstream if start_flags[idx] else all_ends[idx] - downstream) for idx in range(len(start_flags)) ] ) new_ends = np.asarray( [ - ( - all_starts[idx] + downstream - if start_flags[idx] - else all_ends[idx] + upstream - ) + (all_starts[idx] + downstream if start_flags[idx] else all_ends[idx] + upstream) for idx in range(len(start_flags)) ] ) @@ -1471,9 +1431,7 @@ def restrict( (in-place-modified) original. """ - restricted_ir = self._ranges.restrict( - start=start, end=end, keep_all_ranges=True - ) + restricted_ir = self._ranges.restrict(start=start, end=end, keep_all_ranges=True) output = self._define_output(in_place) output._ranges = restricted_ir @@ -1518,11 +1476,7 @@ def trim(self, in_place: bool = False) -> "GenomicRanges": _t_chr = all_chrs[i] _end = all_ends[i] - if ( - is_circular is not None - and is_circular[_t_chr] is False - and _end > seqlengths[_t_chr] - ): + if is_circular is not None and is_circular[_t_chr] is False and _end > seqlengths[_t_chr]: _end = seqlengths[_t_chr] + 1 new_ends.append(_end) @@ -1561,16 +1515,11 @@ def narrow( (in-place-modified) original. """ if start is not None and end is not None and width is not None: - raise ValueError( - "Only provide two of the three parameters - `start`, " - "`end` and `width` but not all!" - ) + raise ValueError("Only provide two of the three parameters - `start`, " "`end` and `width` but not all!") if width is not None: if start is None and end is None: - raise ValueError( - "If width is provided, either start or end must be provided." 
- ) + raise ValueError("If width is provided, either start or end must be provided.") narrow_ir = self._ranges.narrow(start=start, end=end, width=width) output = self._define_output(in_place) @@ -1585,15 +1534,10 @@ def _group_indices_by_chrm(self, ignore_strand: bool = False) -> dict: # __strand[__strand == 0] = 1 _seqnames = [self._seqinfo._seqnames[i] for i in self._seqnames] - grp_keys = np.char.add( - np.char.add(_seqnames, f"{_granges_delim}"), __strand.astype(str) - ) + grp_keys = np.char.add(np.char.add(_seqnames, f"{_granges_delim}"), __strand.astype(str)) unique_grps, inverse_indices = np.unique(grp_keys, return_inverse=True) - chrm_grps = { - str(grp): np.where(inverse_indices == i)[0].tolist() - for i, grp in enumerate(unique_grps) - } + chrm_grps = {str(grp): np.where(inverse_indices == i)[0].tolist() for i, grp in enumerate(unique_grps)} return chrm_grps @@ -1662,9 +1606,7 @@ def reduce( new_seqnames = [x[0] for x in splits] new_strand = np.asarray([int(x[1]) for x in splits]) - output = GenomicRanges( - seqnames=new_seqnames, strand=new_strand, ranges=all_merged_ranges - ) + output = GenomicRanges(seqnames=new_seqnames, strand=new_strand, ranges=all_merged_ranges) if with_reverse_map is True: output._mcols.set_column("revmap", rev_map, in_place=True) @@ -1673,9 +1615,7 @@ def reduce( return output - def range( - self, with_reverse_map: bool = False, ignore_strand: bool = False - ) -> "GenomicRanges": + def range(self, with_reverse_map: bool = False, ignore_strand: bool = False) -> "GenomicRanges": """Calculate range bounds for each distinct (seqname, strand) pair. 
Args: @@ -1713,9 +1653,7 @@ def range( new_seqnames = [x[0] for x in splits] new_strand = np.asarray([int(x[1]) for x in splits]) - output = GenomicRanges( - seqnames=new_seqnames, strand=new_strand, ranges=all_merged_ranges - ) + output = GenomicRanges(seqnames=new_seqnames, strand=new_strand, ranges=all_merged_ranges) if with_reverse_map is True: output._mcols.set_column("revmap", rev_map, in_place=True) @@ -1784,15 +1722,11 @@ def gaps( new_seqnames = [x[0] for x in splits] new_strand = np.asarray([int(x[1]) for x in splits]) - output = GenomicRanges( - seqnames=new_seqnames, strand=new_strand, ranges=all_merged_ranges - ) + output = GenomicRanges(seqnames=new_seqnames, strand=new_strand, ranges=all_merged_ranges) return output - def disjoin( - self, with_reverse_map: bool = False, ignore_strand: bool = False - ) -> "GenomicRanges": + def disjoin(self, with_reverse_map: bool = False, ignore_strand: bool = False) -> "GenomicRanges": """Calculate disjoint genomic positions for each distinct (seqname, strand) pair. Args: @@ -1834,18 +1768,14 @@ def disjoin( new_seqnames = [x[0] for x in splits] new_strand = np.asarray([int(x[1]) for x in splits]) - output = GenomicRanges( - seqnames=new_seqnames, strand=new_strand, ranges=all_merged_ranges - ) + output = GenomicRanges(seqnames=new_seqnames, strand=new_strand, ranges=all_merged_ranges) if with_reverse_map is True: output._mcols.set_column("revmap", rev_map, in_place=True) return output - def coverage( - self, shift: int = 0, width: Optional[int] = None, weight: int = 1 - ) -> Dict[str, np.ndarray]: + def coverage(self, shift: int = 0, width: Optional[int] = None, weight: int = 1) -> Dict[str, np.ndarray]: """Calculate coverage for each chromosome, For each position, counts the number of ranges that cover it. 
Args: @@ -1873,10 +1803,7 @@ def coverage( for chrm, group in chrm_grps.items(): _grp_subset = self[group] - all_intvals = [ - (x[0], x[1]) - for x in zip(_grp_subset._ranges._start, _grp_subset._ranges.end) - ] + all_intvals = [(x[0], x[1]) for x in zip(_grp_subset._ranges._start, _grp_subset._ranges.end)] cov, _ = create_np_vector(intervals=all_intvals, with_reverse_map=False) @@ -2006,23 +1933,15 @@ def intersect_ncls(self, other: "GenomicRanges") -> "GenomicRanges": other_end = other.end other_ncls = NCLS(other.start, other_end, np.arange(len(other))) - _self_indexes, _other_indexes = other_ncls.all_overlaps_both( - self.start, self_end, np.arange(len(self)) - ) + _self_indexes, _other_indexes = other_ncls.all_overlaps_both(self.start, self_end, np.arange(len(self))) - other_chrms = np.array( - [other._seqinfo._seqnames[other._seqnames[i]] for i in _other_indexes] - ) - self_chrms = np.array( - [self._seqinfo._seqnames[self._seqnames[i]] for i in _self_indexes] - ) + other_chrms = np.array([other._seqinfo._seqnames[other._seqnames[i]] for i in _other_indexes]) + self_chrms = np.array([self._seqinfo._seqnames[self._seqnames[i]] for i in _self_indexes]) other_strands = other._strand[_other_indexes] self_strands = self._strand[_self_indexes] - filtered_indexes = np.logical_and( - other_chrms == self_chrms, other_strands == self_strands - ) + filtered_indexes = np.logical_and(other_chrms == self_chrms, other_strands == self_strands) self_starts = self.start[_self_indexes][filtered_indexes] other_starts = other.start[_other_indexes][filtered_indexes] @@ -2100,9 +2019,7 @@ def find_overlaps( raise TypeError("'query' is not a `GenomicRanges` object.") if query_type not in OVERLAP_QUERY_TYPES: - raise ValueError( - f"'{query_type}' must be one of {', '.join(OVERLAP_QUERY_TYPES)}." 
- ) + raise ValueError(f"'{query_type}' must be one of {', '.join(OVERLAP_QUERY_TYPES)}.") rev_map = [[] for _ in range(len(query))] subject_chrm_grps = self._group_indices_by_chrm(ignore_strand=ignore_strand) @@ -2192,9 +2109,7 @@ def count_overlaps( raise TypeError("'query' is not a `GenomicRanges` object.") if query_type not in OVERLAP_QUERY_TYPES: - raise ValueError( - f"'{query_type}' must be one of {', '.join(OVERLAP_QUERY_TYPES)}." - ) + raise ValueError(f"'{query_type}' must be one of {', '.join(OVERLAP_QUERY_TYPES)}.") rev_map = [0 for _ in range(len(query))] subject_chrm_grps = self._group_indices_by_chrm(ignore_strand=ignore_strand) @@ -2284,9 +2199,7 @@ def subset_by_overlaps( raise TypeError("'query' is not a `GenomicRanges` object.") if query_type not in OVERLAP_QUERY_TYPES: - raise ValueError( - f"'{query_type}' must be one of {', '.join(OVERLAP_QUERY_TYPES)}." - ) + raise ValueError(f"'{query_type}' must be one of {', '.join(OVERLAP_QUERY_TYPES)}.") rev_map = [] subject_chrm_grps = self._group_indices_by_chrm(ignore_strand=ignore_strand) @@ -2387,9 +2300,7 @@ def nearest( _sub_subset = self[_subset] _query_subset = query[indices] - res_idx = _sub_subset._ranges.nearest( - query=_query_subset._ranges, select=select, delete_index=False - ) + res_idx = _sub_subset._ranges.nearest(query=_query_subset._ranges, select=select, delete_index=False) for j, val in enumerate(res_idx): _rev_map = [_subset[x] for x in val] @@ -2450,9 +2361,7 @@ def precede( _sub_subset = self[_subset] _query_subset = query[indices] - res_idx = _sub_subset._ranges.precede( - query=_query_subset._ranges, select=select, delete_index=False - ) + res_idx = _sub_subset._ranges.precede(query=_query_subset._ranges, select=select, delete_index=False) for j, val in enumerate(res_idx): _rev_map = [_subset[x] for x in val] @@ -2513,9 +2422,7 @@ def follow( _sub_subset = self[_subset] _query_subset = query[indices] - res_idx = _sub_subset._ranges.follow( - query=_query_subset._ranges, 
select=select, delete_index=False - ) + res_idx = _sub_subset._ranges.follow(query=_query_subset._ranges, select=select, delete_index=False) for j, val in enumerate(res_idx): _rev_map = [_subset[x] for x in val] @@ -2725,9 +2632,7 @@ def invert_strand(self, in_place: bool = False) -> "GenomicRanges": ######>> window methods <<###### ################################ - def tile_by_range( - self, n: Optional[int] = None, width: Optional[int] = None - ) -> "GenomicRanges": + def tile_by_range(self, n: Optional[int] = None, width: Optional[int] = None) -> "GenomicRanges": """Split each sequence length into chunks by ``n`` (number of intervals) or ``width`` (intervals with equal width). @@ -2768,22 +2673,16 @@ def tile_by_range( elif width is not None: twidth = width - all_intervals = split_intervals( - val._ranges._start[0], val._ranges.end[0] - 1, twidth - ) + all_intervals = split_intervals(val._ranges._start[0], val._ranges.end[0] - 1, twidth) seqnames.extend([val.get_seqnames()[0]] * len(all_intervals)) strand.extend([int(val.strand[0])] * len(all_intervals)) starts.extend([x[0] for x in all_intervals]) widths.extend(x[1] for x in all_intervals) - return GenomicRanges( - seqnames=seqnames, strand=strand, ranges=IRanges(start=starts, width=widths) - ) + return GenomicRanges(seqnames=seqnames, strand=strand, ranges=IRanges(start=starts, width=widths)) - def tile( - self, n: Optional[int] = None, width: Optional[int] = None - ) -> "GenomicRanges": + def tile(self, n: Optional[int] = None, width: Optional[int] = None) -> "GenomicRanges": """Split each interval by ``n`` (number of sub intervals) or ``width`` (intervals with equal width). Note: Either ``n`` or ``width`` must be provided but not both. @@ -2824,15 +2723,11 @@ def tile( twidth = math.ceil((val._ranges._width + 1) / (n)) if twidth < 1: - raise RuntimeError( - f"'width' of region is less than 'n' for range in: {counter}." 
- ) + raise RuntimeError(f"'width' of region is less than 'n' for range in: {counter}.") elif width is not None: twidth = width - all_intervals = split_intervals( - val._ranges._start[0], val._ranges.end[0] - 1, twidth - ) + all_intervals = split_intervals(val._ranges._start[0], val._ranges.end[0] - 1, twidth) seqnames.extend([val.get_seqnames()[0]] * len(all_intervals)) strand.extend([int(val.strand[0])] * len(all_intervals)) @@ -2841,9 +2736,7 @@ def tile( counter += 1 - return GenomicRanges( - seqnames=seqnames, strand=strand, ranges=IRanges(start=starts, width=widths) - ) + return GenomicRanges(seqnames=seqnames, strand=strand, ranges=IRanges(start=starts, width=widths)) def sliding_windows(self, width: int, step: int = 1) -> "GenomicRanges": """Slide along each range by ``width`` (intervals with equal ``width``) and ``step``. @@ -2881,9 +2774,7 @@ def sliding_windows(self, width: int, step: int = 1) -> "GenomicRanges": starts.extend([x[0] for x in all_intervals]) widths.extend(x[1] for x in all_intervals) - return GenomicRanges( - seqnames=seqnames, strand=strand, ranges=IRanges(start=starts, width=widths) - ) + return GenomicRanges(seqnames=seqnames, strand=strand, ranges=IRanges(start=starts, width=widths)) @classmethod def tile_genome( @@ -2953,9 +2844,7 @@ def tile_genome( starts.extend([x[0] for x in all_intervals]) widths.extend(x[1] for x in all_intervals) - return GenomicRanges( - seqnames=seqnames, strand=strand, ranges=IRanges(start=starts, width=widths) - ) + return GenomicRanges(seqnames=seqnames, strand=strand, ranges=IRanges(start=starts, width=widths)) def binned_average( self, @@ -3034,9 +2923,7 @@ def split(self, groups: list) -> "GenomicRangesList": """ if len(groups) != len(self): - raise ValueError( - "Number of groups must match the number of genomic elements." 
- ) + raise ValueError("Number of groups must match the number of genomic elements.") gdict = group_by_indices(groups=groups) @@ -3068,9 +2955,7 @@ def empty(cls): ######>> subtract <<###### ########################## - def subtract( - self, x: "GenomicRanges", min_overlap: int = 1, ignore_strand: bool = False - ) -> "GenomicRangesList": + def subtract(self, x: "GenomicRanges", min_overlap: int = 1, ignore_strand: bool = False) -> "GenomicRangesList": """Subtract searches for features in ``x`` that overlap ``self`` by at least the number of base pairs given by ``min_overlap``. @@ -3091,9 +2976,7 @@ def subtract( the subtracted regions. """ _x_reduce = x.reduce(ignore_strand=ignore_strand) - hits = self.find_overlaps( - _x_reduce, min_overlap=min_overlap, ignore_strand=ignore_strand - ) + hits = self.find_overlaps(_x_reduce, min_overlap=min_overlap, ignore_strand=ignore_strand) gr_idxs = [[] for _ in range(len(self))] for ii, ix in enumerate(hits): diff --git a/src/genomicranges/GenomicRangesList.py b/src/genomicranges/GenomicRangesList.py index e3b8828..831ba80 100644 --- a/src/genomicranges/GenomicRangesList.py +++ b/src/genomicranges/GenomicRangesList.py @@ -16,12 +16,8 @@ def _validate_ranges(ranges, num_ranges): if ranges is None: raise ValueError("'ranges' cannot be None.") - if not ( - isinstance(ranges, GenomicRanges) or ut.is_list_of_type(ranges, GenomicRanges) - ): - raise TypeError( - "`ranges` must be either a `GenomicRanges` or a list of `GenomicRanges`." 
- ) + if not (isinstance(ranges, GenomicRanges) or ut.is_list_of_type(ranges, GenomicRanges)): + raise TypeError("`ranges` must be either a `GenomicRanges` or a list of `GenomicRanges`.") if isinstance(ranges, list) and sum([len(x) for x in ranges]) != num_ranges: raise ValueError( @@ -40,15 +36,11 @@ def _validate_optional_attrs(mcols, names, num_ranges): raise TypeError("'mcols' is not a `BiocFrame` object.") if mcols.shape[0] != num_ranges: - raise ValueError( - "Length of 'mcols' does not match the number of genomic elements." - ) + raise ValueError("Length of 'mcols' does not match the number of genomic elements.") if names is not None: if len(names) != num_ranges: - raise ValueError( - "Length of 'names' does not match the number of genomic elements." - ) + raise ValueError("Length of 'names' does not match the number of genomic elements.") if any(x is None for x in names): raise ValueError("'names' cannot contain None values.") @@ -79,11 +71,7 @@ def __iter__(self): def __next__(self): if self._current_index < len(self._grl): - iter_row_index = ( - self._grl.names[self._current_index] - if self._grl.names is not None - else None - ) + iter_row_index = self._grl.names[self._current_index] if self._grl.names is not None else None iter_slice = self._grl[self._current_index] self._current_index += 1 @@ -109,20 +97,84 @@ class GenomicRangesList: .. 
code-block:: python a = GenomicRanges( - seqnames=["chr1", "chr2", "chr1", "chr3"], - ranges=IRanges([1, 3, 2, 4], [10, 30, 50, 60]), - strand=["-", "+", "*", "+"], - mcols=BiocFrame({"score": [1, 2, 3, 4]}), + seqnames=[ + "chr1", + "chr2", + "chr1", + "chr3", + ], + ranges=IRanges( + [ + 1, + 3, + 2, + 4, + ], + [ + 10, + 30, + 50, + 60, + ], + ), + strand=[ + "-", + "+", + "*", + "+", + ], + mcols=BiocFrame( + { + "score": [ + 1, + 2, + 3, + 4, + ] + } + ), ) b = GenomicRanges( - seqnames=["chr2", "chr4", "chr5"], - ranges=IRanges([3, 6, 4], [30, 50, 60]), - strand=["-", "+", "*"], - mcols=BiocFrame({"score": [2, 3, 4]}), + seqnames=[ + "chr2", + "chr4", + "chr5", + ], + ranges=IRanges( + [3, 6, 4], + [ + 30, + 50, + 60, + ], + ), + strand=[ + "-", + "+", + "*", + ], + mcols=BiocFrame( + { + "score": [ + 2, + 3, + 4, + ] + } + ), ) - grl = GenomicRangesList(ranges=[gr1, gr2], names=["gene1", "gene2"]) + grl = GenomicRangesList( + ranges=[ + a, + b, + ], + names=[ + "gene1", + "gene2", + ], + ) Additionally, you may also provide metadata about the genomic elements in the dictionary using mcols attribute. @@ -280,9 +332,7 @@ def __str__(self) -> str: Returns: A pretty-printed string containing the contents of this ``GenomicRangesList``. 
""" - output = ( - f"GenomicRangesList with {len(self)} range{'s' if len(self) != 1 else ''}" - ) + output = f"GenomicRangesList with {len(self)} range{'s' if len(self) != 1 else ''}" output += f" and {len(self._mcols.get_column_names())} metadata column{'s' if len(self._mcols.get_column_names()) != 1 else ''}\n" if isinstance(self._ranges, GenomicRanges) and len(self._ranges) == 0: @@ -355,9 +405,7 @@ def get_ranges(self) -> Union[GenomicRanges, List[GenomicRanges]]: return self._ranges - def set_ranges( - self, ranges: Union[GenomicRanges, List[GenomicRanges]], in_place: bool = False - ) -> "GenomicRanges": + def set_ranges(self, ranges: Union[GenomicRanges, List[GenomicRanges]], in_place: bool = False) -> "GenomicRanges": """Set new genomic ranges. Args: @@ -543,9 +591,7 @@ def set_metadata(self, metadata: dict, in_place: bool = False) -> "GenomicRanges or as a reference to the (in-place-modified) original. """ if not isinstance(metadata, dict): - raise TypeError( - f"`metadata` must be a dictionary, provided {type(metadata)}." - ) + raise TypeError(f"`metadata` must be a dictionary, provided {type(metadata)}.") output = self._define_output(in_place) output._metadata = metadata @@ -587,9 +633,7 @@ def groups(self, group: Union[str, int]) -> "GenomicRangesList": group = self._names.map(group) if group < 0 or group > len(self): - raise ValueError( - "'group' must be less than the number of genomic elements." - ) + raise ValueError("'group' must be less than the number of genomic elements.") return self[group] @@ -768,9 +812,7 @@ def to_pandas(self) -> "pandas.DataFrame": ######>> slicers <<######### ############################ - def __getitem__( - self, args: Union[str, int, tuple, list, slice] - ) -> Union[GenomicRanges, "GenomicRangesList"]: + def __getitem__(self, args: Union[str, int, tuple, list, slice]) -> Union[GenomicRanges, "GenomicRangesList"]: """Subset individual genomic elements. 
Args: @@ -808,9 +850,7 @@ def __getitem__( if isinstance(idx, list): if ut.is_list_of_type(idx, bool): if len(idx) != len(self): - raise ValueError( - "`indices` is a boolean vector, length should match the size of the data." - ) + raise ValueError("`indices` is a boolean vector, length should match the size of the data.") idx = [i for i in range(len(idx)) if idx[i] is True] @@ -825,9 +865,7 @@ def __getitem__( if self.mcols is not None: new_mcols = self.mcols[idx, :] - return GenomicRangesList( - new_ranges, new_range_lengths, new_names, new_mcols, self._metadata - ) + return GenomicRangesList(new_ranges, new_range_lengths, new_names, new_mcols, self._metadata) elif isinstance(idx, (slice, range)): if isinstance(idx, range): idx = slice(idx.start, idx.stop, idx.step) diff --git a/src/genomicranges/SeqInfo.py b/src/genomicranges/SeqInfo.py index 06e8be7..0b1c19f 100644 --- a/src/genomicranges/SeqInfo.py +++ b/src/genomicranges/SeqInfo.py @@ -320,9 +320,7 @@ def get_seqnames(self) -> List[str]: """ return self._seqnames - def set_seqnames( - self, seqnames: Sequence[str], in_place: bool = False - ) -> "SeqInfo": + def set_seqnames(self, seqnames: Sequence[str], in_place: bool = False) -> "SeqInfo": """ Args: seqnames: @@ -409,9 +407,7 @@ def seqlengths(self) -> List[int]: return self.get_seqlengths() @seqlengths.setter - def seqlengths( - self, seqlengths: Optional[Union[int, Sequence[int], Dict[str, int]]] - ): + def seqlengths(self, seqlengths: Optional[Union[int, Sequence[int], Dict[str, int]]]): warn( "Setting property 'seqlengths' is an in-place operation, use 'set_seqlengths' instead", UserWarning, @@ -471,9 +467,7 @@ def is_circular(self) -> List[bool]: return self.get_is_circular() @is_circular.setter - def is_circular( - self, is_circular: Optional[Union[bool, Sequence[bool], Dict[str, bool]]] - ): + def is_circular(self, is_circular: Optional[Union[bool, Sequence[bool], Dict[str, bool]]]): warn( "Setting property 'is_circular' is an in-place operation, use 
'set_is_circular' instead", UserWarning, diff --git a/src/genomicranges/io/gtf.py b/src/genomicranges/io/gtf.py index 541d4f4..41f20c0 100644 --- a/src/genomicranges/io/gtf.py +++ b/src/genomicranges/io/gtf.py @@ -97,9 +97,7 @@ def parse_gtf( comment=comment, ) - rows = Parallel(n_jobs=-2)( - delayed(_parse_all_attribute)(row) for _, row in df.iterrows() - ) + rows = Parallel(n_jobs=-2)(delayed(_parse_all_attribute)(row) for _, row in df.iterrows()) gtf = DataFrame.from_records(rows) gtf.drop(["group"], axis=1) diff --git a/src/genomicranges/io/ucsc.py b/src/genomicranges/io/ucsc.py index 929405d..7e03a48 100644 --- a/src/genomicranges/io/ucsc.py +++ b/src/genomicranges/io/ucsc.py @@ -31,9 +31,7 @@ def access_gtf_ucsc( base_path = f"http://hgdownload.cse.ucsc.edu/goldenPath/{genome}/bigZips/genes/" if type not in ["refGene", "ensGene", "knownGene", "ncbiRefSeq"]: - raise ValueError( - f"type must be one of refGene, ensGene, knownGene or ncbiRefSeq, provided {type}" - ) + raise ValueError(f"type must be one of refGene, ensGene, knownGene or ncbiRefSeq, provided {type}") full_path = f"{base_path}/{genome}.{type}.gtf.gz" diff --git a/src/genomicranges/utils.py b/src/genomicranges/utils.py index b09dd94..fa5fda0 100644 --- a/src/genomicranges/utils.py +++ b/src/genomicranges/utils.py @@ -12,9 +12,7 @@ REV_STRAND_MAP = {"1": "+", "-1": "-", "0": "*"} -def sanitize_strand_vector( - strand: Union[Sequence[str], Sequence[int], np.ndarray] -) -> np.ndarray: +def sanitize_strand_vector(strand: Union[Sequence[str], Sequence[int], np.ndarray]) -> np.ndarray: """Create a numpy representation for ``strand``. Mapping: 1 for "+" (forward strand), 0 for "*" (any strand) and -1 for "-" (reverse strand). @@ -54,9 +52,7 @@ def sanitize_strand_vector( ) return np.asarray(strand, dtype=np.int8) else: - raise TypeError( - "'strand' must be either a numpy vector, a list of integers or strings representing strand." 
- ) + raise TypeError("'strand' must be either a numpy vector, a list of integers or strings representing strand.") def _sanitize_vec(x: Sequence): @@ -210,9 +206,4 @@ def create_np_vector( def group_by_indices(groups: list) -> dict: - return { - k: [x[0] for x in v] - for k, v in groupby( - sorted(enumerate(groups), key=lambda x: x[1]), lambda x: x[1] - ) - } + return {k: [x[0] for x in v] for k, v in groupby(sorted(enumerate(groups), key=lambda x: x[1]), lambda x: x[1])}