emDNA-GPU/basic.py at main · ryanDing26/emDNA-GPU · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import subprocess

def get_all_136_tetramers():
    """Generate and return all 136 unique tetrameric contexts."""
    def reverse_complement(seq):
        complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
        return ''.join(complement[base] for base in reversed(seq))

    bases = ['A', 'T', 'G', 'C']
    unique_tetramers = set()
    seen_structures = set()

    for b1 in bases:
        for b2 in bases:
            for b3 in bases:
                for b4 in bases:
                    tetramer = b1 + b2 + b3 + b4
                    rev_comp = reverse_complement(tetramer)
                    canonical = min(tetramer, rev_comp)

                    if canonical not in seen_structures:
                        seen_structures.add(canonical)
                        unique_tetramers.add(tetramer)

    return sorted(unique_tetramers)

def create_tetramer_test_sequence(target_tetramer, length, repeat_count=10):
    """
    Create a DNA sequence enriched with the target tetramer.

    Args:
        target_tetramer: The 4-base sequence to test (e.g., 'ATAG')
        length: Total length in base pairs
        repeat_count: How many times to insert the tetramer
    """
    # Start with a neutral background sequence (alternating GC is common)
    # or use the complement of your tetramer to avoid creating unintended contexts
    background = 'GC' * (length // 2)
    seq_list = list(background[:length])

    # Calculate spacing to distribute tetramers evenly
    spacing = length // (repeat_count + 1)

    # Insert target tetramer at regular intervals
    for i in range(repeat_count):
        pos = (i + 1) * spacing
        if pos + 4 <= length:
            seq_list[pos:pos+4] = list(target_tetramer)

    # Ensure circularity doesn't create unwanted tetramer at junction
    final_seq = ''.join(seq_list)

    return final_seq


def run_emdna_optimization(test_case, model='tetrameric', output_dir='results'):
    """
    Run emDNA optimization for a single test case.

    Args:
        test_case: Dict with sequence, linking_number, etc.
        model: 'tetrameric' or 'dimeric' (Cohen2017)
        output_dir: Where to save results
    """
    # Create input file for emDNA
    input_file = f"{output_dir}/{test_case['name']}_{model}.inp"
    output_file = f"{output_dir}/{test_case['name']}_{model}.out"

    # Write sequence and parameters to input file
    # (Format depends on emDNA's input specification)
    with open(input_file, 'w') as f:
        f.write(f"SEQUENCE {test_case['sequence']}\n")
        f.write(f"LENGTH {test_case['length']}\n")
        f.write(f"LINKING_NUMBER {test_case['linking_number']}\n")
        f.write(f"MODEL {model}\n")

    # Run emDNA
    cmd = f"emDNA -i {input_file} -o {output_file}"
    subprocess.run(cmd, shell=True, check=True)

    return output_file

def main():
    # Parameters matching the student's work
    lengths = range(80, 151)  # 80-150 bp
    linking_numbers_per_length = lambda L: [
        L/10.5 - 1,  # Negatively supercoiled
        L/10.5,      # Relaxed
        L/10.5 + 1   # Positively supercoiled
    ]

    # Generate all combinations
    test_cases = []
    for tetramer in get_all_136_tetramers():
        for length in lengths:
            seq = create_tetramer_test_sequence(tetramer, length)
            for lk in linking_numbers_per_length(length):
                test_cases.append({
                    'tetramer': tetramer,
                    'sequence': seq,
                    'length': length,
                    'linking_number': lk,
                    'name': f"{tetramer}_{length}bp_lk{lk:.1f}"
                })

    print(f"Total test cases: {len(test_cases)}")
    # 136 tetramers × 71 lengths × 3 linking numbers = 28,968 simulations