-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdistancemat.c
483 lines (409 loc) · 15.9 KB
/
distancemat.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
/**********************************************************************
* File: distancemat.c
* Author: Kevin Howe
* Copyright (C) Genome Research Limited, 2002-
*-------------------------------------------------------------------
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------
* NOTES:
* Functions and types for the manipulation of Distance Matrices
**********************************************************************/
#include "njheaders/distancemat.h"
/*********************** static variables *****************************/
/* This is a table of estimated PAMs for a range of percentage-differences,
ranging from 75% dissimilarity, going up in 0.1% steps, to 93%
dissimilarity. For percentage dissimilarty outside this range, we either
use Kimura's formula (for < 75) or give an arbitrarily high distance
(for > 93) */
static int dayhoff_pams[]={
195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */
196, /* 75.1% observed d; 196 PAMs estimated */
197, 198, 199, 200, 200, 201, 202, 203,
204, 205, 206, 207, 208, 209, 209, 210, 211, 212,
213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
223, 224, 226, 227, 228, 229, 230, 231, 232, 233,
234, 236, 237, 238, 239, 240, 241, 243, 244, 245,
246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */
252, 253, 254, 255, 257, 258,
260, 261, 262, 264, 265, 267, 268, 270, 271, 273,
274, 276, 277, 279, 281, 282, 284, 285, 287, 289,
291, 292, 294, 296, 298, 299, 301, 303, 305, 307,
309, 311, 313, 315, 317, 319, 321, 323, 325, 328,
330, 332, 335, 337, 339, 342, 344, 347, 349, 352,
354, 357, 360, 362, 365, 368, 371, 374, 377, 380,
383, 386, 389, 393, 396, 399, 403, 407, 410, 414,
418, 422, 426, 430, 434, 438, 442, 447, 451, 456,
461, 466, 471, 476, 482, 487, 493, 498, 504, 511,
517, 524, 531, 538, 545, 553, 560, 569, 577, 586,
595, 605, 615, 626, 637, 649, 661, 675, 688, 703,
719, 736, 754, 775, 796, 819, 845, 874, 907, 945,
/* 92.9% observed; 945 PAMs */
988 /* 93.0% observed; 988 PAMs */
};
/*********************************************************************
FUNCTION: calc_DistanceMatrix
DESCRIPTION:
Produces a distance matrix from the given multiple alignment
RETURNS: struct DistanceMatrix
ARGS:
A DistanceMatrix to fill in
A multiple alignment
A boolean indicating whether or not random columns should be used
for purposes of bootstrapping
A boolean indicating whether the Kimura distance adjustment is to
be used or not.
NOTES:
0. the given DistanceMatrix and Alignment should be of the same order
1. The matrix produced is in bottom-left triangular format; don't you
go trying to access that top-right section (I'm warning you...)
2. At the moment, the function calculates distance based on sequence
identity, using Kimura's function if that option is raised.
3. If use_rand_cols is true, then the matrix is constructed using
random sampling of columns, for the purposes of bootstrapping. At
the moment, the native function 'rand' is used to do this, suitable
seeded by time (by the caller). This may prove unsatisfactory...
4. Where no information is available to determine the distance
between two sequences, a value of twice the maximum observed
distance is assigned (inspiration from ISMB99 poster by Huson,
Smith and Warnow).
*********************************************************************/
void calc_DistanceMatrix( struct DistanceMatrix *mat,
struct Alignment *aln,
unsigned int use_rand_cols,
unsigned int use_kimura ) {
/* this function will take alignment and return a distance matrix */
/* This gives a clear separation between tree making and distances */
unsigned int i, j, k, table_index, num_undefined_distances, mem_increment;
unsigned int *columnlist;
Distance residuecount, distance, max_observed_distance;
Distance **undefined_distances;
columnlist = (unsigned int *) malloc_util( aln->length * sizeof(unsigned int));
for (i=0; i < aln->length; i++) {
if (use_rand_cols) {
/* generate random column here */
columnlist[i] = (unsigned int) rand() % aln->length;
}
else {
columnlist[i] = i;
}
}
max_observed_distance = 0.0;
mem_increment = 10;
undefined_distances = NULL;
num_undefined_distances = 0;
for( i=0; i < aln->numseqs; i++) {
mat->data[i][i] = 0.0;
for( j=0; j < i; j++ ) {
residuecount = distance = 0.0;
mat->data[i][j] = 0.0;
for( k=0; k < aln->length; k++) {
if ( aln->seqs[i]->seq[columnlist[k]] == '.' ||
aln->seqs[j]->seq[columnlist[k]] == '.' ||
aln->seqs[i]->seq[columnlist[k]] == '-' ||
aln->seqs[j]->seq[columnlist[k]] == '-' ||
aln->seqs[i]->seq[columnlist[k]] == ' ' ||
aln->seqs[j]->seq[columnlist[k]] == ' ')
continue;
/* neither character is a gap, so proceed */
residuecount += 1.0;
if ( aln->seqs[i]->seq[columnlist[k]] != aln->seqs[j]->seq[columnlist[k]])
distance += 1.0;
}
/* if residue count was zero here, there must have been a gap in every position;
in this case, %identity is undefined so a decision must be made to consider the
sequences 100% identical or 100% different. I go with the standard approach of
assigning twoce the maximum observed distance */
if (residuecount > 0) {
distance = distance / residuecount;
}
else {
distance = -1.0;
if (num_undefined_distances % mem_increment == 0) {
if (num_undefined_distances == 0) {
undefined_distances = (Distance **)
malloc_util( sizeof( Distance *) * mem_increment);
}
else {
undefined_distances = (Distance **)
realloc_util( undefined_distances,
(num_undefined_distances + mem_increment) * sizeof(Distance *));
}
}
undefined_distances[num_undefined_distances++] = &(mat->data[i][j]);
/* distances should be zero, but may be slightly less due to floating point
arithmetic */
}
/* Use Kimura's formula to convert percentage dissimilarity to a distance */
if (use_kimura) {
if ( distance < 0.75) {
if (distance > 0.0)
distance = - log( 1.0 - distance - (distance * distance * 0.20) );
}
else {
if (distance > 0.930) {
distance = 10.0;
}
else {
table_index = (int) ((distance*1000.0) - 750.0);
distance = (Distance) dayhoff_pams[ table_index ];
distance /= 100.0;
}
}
}
if (distance > max_observed_distance) {
max_observed_distance = distance;
}
mat->data[i][j] = distance;
}
}
/* before we go, lets find those undefined distances and replace them with
twice the maximum observed distance */
for (i=0; i < num_undefined_distances; i++ ) {
*undefined_distances[i] = 2 * max_observed_distance;
}
columnlist = free_util( columnlist );
if (undefined_distances != NULL) {
undefined_distances = free_util( undefined_distances );
}
}
/*********************************************************************
FUNCTION: clone_DistanceMatrix
DESCRIPTION:
Produces a brand new DistanceMatrix, identical to the source
RETURNS: struct DistanceMatrix
ARGS:
A source distane matrix
NOTES:
1. The matrix produced is in bottom-left triangular format; don't you
go trying to access that top-right section (I'm warning you...)
*********************************************************************/
struct DistanceMatrix *clone_DistanceMatrix( struct DistanceMatrix *source) {
unsigned int i,j;
struct DistanceMatrix *dest;
if (source != NULL) {
dest = empty_DistanceMatrix( source->size );
for( i=0; i < dest->size; i++) {
for( j=0; j <= i; j++ ) {
dest->data[i][j] = source->data[i][j];
}
}
}
else {
dest = NULL;
}
return dest;
}
/*********************************************************************
FUNCTION: empty_DistanceMatrix
DESCRIPTION:
Produces an empty distance matrixof the given size, uninitialised
RETURNS: struct DistanceMatrix
ARGS:
The size of the matrix to be created
NOTES:
1. The matrix produced is in bottom-left triangular format; don't you
go trying to access that top-right section (I'm warning you...)
*********************************************************************/
struct DistanceMatrix *empty_DistanceMatrix( unsigned int size) {
unsigned int i,j;
struct DistanceMatrix *mat;
mat = (struct DistanceMatrix *) malloc_util(sizeof(struct DistanceMatrix));
mat->size = size;
mat->data = (Distance **) malloc_util( mat->size * sizeof(Distance *) );
for( i=0; i < mat->size; i++) {
mat->data[i] = (Distance *) malloc_util( (i+1) * sizeof(Distance) );
for( j=0; j <= i; j++)
mat->data[i][j] = 0.0;
}
return mat;
}
/*********************************************************************
FUNCTION: free_DistanceMatrix
DESCRIPTION:
Frees the memory for the given distance matrix
RETURNS:
ARGS:
struct DistanceMatrix *
NOTES:
*********************************************************************/
void *free_DistanceMatrix( struct DistanceMatrix *mat ) {
int i;
if ( mat != NULL ) {
if (mat->data != NULL) {
for( i=0; i < mat->size; i++ ) {
if (mat->data[i] != NULL)
mat->data[i] = free_util( mat->data[i] );
}
mat->data = free_util( mat->data );
}
mat = free_util( mat );
}
return mat;
}
/**********************************************************************
FUNCTION: index_DistanceMatrix
DESCRIPTION:
indexes the given distance matrix with the given indices,
returning the appropraite distance.
RETURNS: distance (float)
ARGS:
A distance matrix *
row index
column index
NOTES:
This function is necessary to account for the fact that the distance
matrix may be implemented as a symmtrical or triangular matrix.
It therefore abstracts the internals of the distance matrix, at the
cost of a function call for each lookup (is this wise...?)
**********************************************************************/
Distance index_DistanceMatrix( struct DistanceMatrix *mat,
unsigned int i,
unsigned int j) {
if (i > j)
return mat->data[i][j];
else
return mat->data[j][i];
}
/*********************************************************************
FUNCTION: print_DistanceMatrix
DESCRIPTION:
Prints the given distance matrix.
RETURNS:
ARGS:
struct DistanceMatrix *
NOTES:
A DistanceMatrix does not exist in isolation in practice but as
part of a Cluster (this is to maintain the tight coupling between
the matrix and the sequences for which it is expressing the distances).
Therefore, to read or write a useful distance
matrix (for compatibility with the phylip package for example)
use write_phylip_Cluster
*********************************************************************/
void print_DistanceMatrix( FILE *handle, struct DistanceMatrix *mat ) {
unsigned int row, column;
fprintf( handle, "Size:%d\n", mat->size);
for(row=0; row < mat->size; row++) {
fprintf( handle, "%5d", row);
for(column=0; column <= row; column++)
fprintf( handle, "%10.5f", mat->data[row][column]);
fprintf( handle, "\n");
}
fflush( handle );
}
/*********************************************************************
FUNCTION: read_phylip_DistanceMatrix
DESCRIPTION:
This function creates a DistanceMatrix from the given input file.
It also crates a dummy alignment (sequences with just names) and
puts it in the given Alignment pointer
RETURNS: struct Cluster *
ARGS:
A file handle
A pointer to an Alignment pointer
NOTES:
The file is assumed to be the distance matrix file format used
by the phlip package:
4
Name_1 0.0000 0.6776 0.6786 0.2342
Name_2 0.6776 0.0000 0.1111 0.9999
Name_3 0.6786 0.1111 0.0000 0.4444
Name_4 0.2342 0.9999 0.4444 0.0000
*********************************************************************/
struct DistanceMatrix *read_phylip_DistanceMatrix( FILE *handle, struct Alignment **aln_loc) {
struct DistanceMatrix *mat;
unsigned int size, i, j;
char identifier[MAX_PHYLIP_NAME_LEN + 1];
double dist;
/* The size of the matrix will be on the first line on its own */
if (! fscanf( handle, "%d", &size ))
fatal_util( "Parse error: The first line should contain the size of matrix");
*aln_loc = (struct Alignment *) malloc_util( sizeof(struct Alignment ));
(*aln_loc)->numseqs = size;
(*aln_loc)->seqs = (struct Sequence **)
malloc_util( size * sizeof( struct Sequence *));
(*aln_loc)->length = 0;
mat = empty_DistanceMatrix( size );
//print_DistanceMatrix(stdout,mat);
for (i=0; i < size; i++) {
/* In PHYLIP format, the name is exactly 10 characters. However, we allow up to
MAX_PHYLIP_NAME_LEN */
unsigned int cidx;
char c;
unsigned int newline = 0;
if (! fscanf( handle, " %c", identifier))
fatal_util( "Parse error: failed to read the matrix name");
cidx = 1;
while(c = fgetc(handle)) {
if (c == '\n')
newline = 1;
if (isspace(c))
break;
if (cidx < MAX_PHYLIP_NAME_LEN)
identifier[cidx++] = c;
}
identifier[cidx] = '\0';
/* In PHYLIP format, the distance matrix is symmetrical. However, also
cope with a bottom-left matrix */
(*aln_loc)->seqs[i] = empty_Sequence();
(*aln_loc)->seqs[i]->name = (char *) malloc_util( (strlen(identifier) + 1) * sizeof(char));
strcpy( (*aln_loc)->seqs[i]->name, identifier );
for (j=0; j <= i; j++) {
if (! fscanf( handle, "%lf", &dist))
fatal_util("Parse error whilst looking for distance");
mat->data[i][j] = (Distance) dist;
}
if (!newline) {
do {
c = fgetc(handle);
} while(c != '\n');
}
}
//print_DistanceMatrix(stdout,mat);
return mat;
}
/*********************************************************************
FUNCTION: write_phylip_DistanceMatrix
DESCRIPTION:
This function takes the given DistanceMatrix and writes it to the
given file handle in phylip format. The alignment is needed for the
Sequence names
format
RETURNS:
ARGS:
A file handle
A DistanceMatrix pointer (cluster.h)
An Alignment pointer
NOTES:
The file is written in the distance matrix file format used
by the phlip package:
4
Name_1 0.0000 0.6776 0.6786 0.2342
Name_1 0.6776 0.0000 0.1111 0.9999
Name_1 0.6786 0.1111 0.0000 0.4444
Name_1 0.2342 0.9999 0.4444 0.0000
*********************************************************************/
void write_phylip_DistanceMatrix( FILE *handle,
struct DistanceMatrix *mat,
struct Alignment *align) {
unsigned int row, column;
fprintf( handle, "\t%d\n", align->numseqs);
for(row=0; row < align->numseqs; row++) {
fprintf( handle, "%10.10s", align->seqs[row]->name);
for(column=0; column < align->numseqs; column++) {
if (row > column )
fprintf( handle, "%10.5f", mat->data[row][column]);
else
fprintf( handle, "%10.5f", mat->data[column][row]);
}
fprintf( handle, "\n");
}
fflush( handle );
}