import sys
import traceback
import collections
-import time
-from enum import Enum
-from utilities import *
import logging

+if sys.version_info < (3, 0):
+    print("Sorry, requires Python 3.x, not Python 2.x")
+    sys.exit(1)
+sys.path.append(os.path.join(os.path.realpath(os.path.dirname(sys.argv[0])), 'utilities'))
+from utilities import *
+

#ignore some file extensions
def remove_irrelevant_files(listOfFiles):
    return [file for file in listOfFiles if not (file.endswith('.log')
@@ -24,7 +27,7 @@ def remove_irrelevant_files_from_dcmp(dcmp, filter_function=remove_irrelevant_fi
    dcmp.right_only = filter_function(dcmp.right_only)
    dcmp.diff_files = filter_function(dcmp.diff_files)
    dcmp.funny_files = filter_function(dcmp.funny_files)
-    dcmp.common_files = filter_function(dcmp.common_files)
+    dcmp.common_files = filter_function(dcmp.common_files)
    dcmp.common_funny = filter_function(dcmp.common_funny)

    for sub_dcmp in dcmp.subdirs.values():
@@ -57,7 +60,7 @@ def get_all_common_different_files(dcmp):

def get_hash_sum_of_lines(filename):
    """this can be used to get a nearly unique identifier for the content of a file
-    where order does not matter. Two files with identical linesin different order should have the same hash sum"""
+    where order does not matter. Two files with identical lines in different order should have the same hash sum"""
    with open(filename) as infile:
        hash_sum = sum(hash(l) for l in infile)
    return hash_sum
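
#illustrative sketch (hypothetical paths): if new/trips.csv contains exactly the same
#lines as ref/trips.csv but in a different order, then
#    get_hash_sum_of_lines('ref/trips.csv') == get_hash_sum_of_lines('new/trips.csv')
#note that str hashes are randomized per process (PYTHONHASHSEED), so these sums are
#only comparable within a single run, which is all this script needs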
@@ -72,7 +75,6 @@ def print_diff_files(dcmp):
        print_diff_files(sub_dcmp)

def are_outputs_equal(parameters):
-    start_time = time.perf_counter()
    parser = argparse.ArgumentParser(description='Compare two DaySim output directories')
    parser.add_argument('--outputs_reference', help='The reference saved outputs from a successful run [default: %(default)s]')
    parser.add_argument('--outputs_new', help='Newly generated result to be compared to reference [default: %(default)s]')
@@ -95,109 +97,97 @@ def are_outputs_equal(parameters):
    elif not os.path.isdir(args.outputs_new):
        raise Exception('outputs_reference "' + args.outputs_reference + '" exists but not outputs_new "' + args.outputs_new + '"')

-
+    print('python ' + os.path.realpath(__file__) + ' --outputs_reference "' + os.path.realpath(args.outputs_reference) + '" --outputs_new "' + os.path.realpath(args.outputs_new) + '"')
    dcmp = filecmp.dircmp(args.outputs_reference, args.outputs_new)
    remove_irrelevant_files_from_dcmp(dcmp)

-    #logging.debug('dcmp finished')
-    #logging.debug('perf_time(): ' + str(time.perf_counter() - start_time))
-
    are_all_files_common = are_all_files_common_func(dcmp)
-    #logging.debug('are_all_files_common finished: ' + str(are_all_files_common))
-    #logging.debug('perf_time(): ' + str(time.perf_counter() - start_time))

    if not are_all_files_common:
        result = False
        print("Folders do not have all of the same files so regression fails.")
+        dcmp.report_full_closure()
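+        #(note: filecmp's report_full_closure() recursively prints how the two directory trees differ)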
    else:
        all_common_different_files = get_all_common_different_files(dcmp)
-        result = len(all_common_different_files) == 0 #result is good if all common files are the same
+        result = True #this will be changed to False if any individual file differs in an important way (other than line order)
        logging.debug('There are #' + str(len(all_common_different_files)) + ' files which are not binary identical. Will look more deeply.')
-        #logging.debug('perf_time(): ' + str(time.perf_counter() - start_time))

+        actuallyDifferentFiles = []
        for different_file in all_common_different_files:
-            result = False #since files are different assume failure unless changed again
            #some DaySim files are identical in content but are output in a different line order
            reference_file = os.path.join(args.outputs_reference, different_file)
            assert os.path.isfile(reference_file), "reference_file is not a file: " + reference_file
            filename, file_extension = os.path.splitext(reference_file)
            allow_text_comparison = file_extension in ['.csv', '.dat', '.tsv', '.txt']
            new_file = os.path.join(args.outputs_new, different_file)
            assert os.path.isfile(new_file), "new_file is not a file: " + new_file
-            if os.path.getsize(reference_file) != os.path.getsize(new_file):
-                logging.debug('length of common file: ' + different_file + ' differs so difference must be more than different sort order!')
+            #could check file size here with os.path.getsize if concerned about speed, but don't bother because we want to give a more detailed diff if possible
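+            #(illustrative: a pure line reordering preserves byte size, so a quick
+            # os.path.getsize(reference_file) != os.path.getsize(new_file) check
+            # would already prove the files differ by more than line order)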
+            filesAreDifferent = not allow_text_comparison
+            if filesAreDifferent:
+                print('Files are different: "' + different_file + '" but do not know how to examine this type of file line by line so must assume different in a significant way!')
            else:
-                logging.debug('Common_file that is binary different at least has same file size so, if suitable text file, will check to see if same contents in different order. File: ' + different_file)
-                if allow_text_comparison:
-                    #since same size need to check if same lines but in different order
-
-                    #quickest and least memory method is to sum the hash of each line and then compare
-                    hash_sum_reference = get_hash_sum_of_lines(reference_file)
-                    #logging.debug('hash_sum of reference: ' + str(hash_sum_reference))
-                    #logging.debug('perf_time(): ' + str(time.perf_counter() - start_time))
-                    hash_sum_new_file = get_hash_sum_of_lines(new_file)
-                    #logging.debug('hash_sum of new file: ' + str(hash_sum_new_file))
-                    #logging.debug('perf_time(): ' + str(time.perf_counter() - start_time))
-
-                    if hash_sum_reference == hash_sum_new_file:
-                        print('File "' + different_file + '" has identical content just in different order.')
-                        result = True #files count as same despite different order
-                    #else files are different in more than just sort order!
-
-                if result == False:
-                    if not allow_text_comparison:
-                        logging.debug('Files are different but unhandled extension "' + file_extension + '" so cannot check if differ only by line order. Therefore regression fails.')
-                    else:
-                        logging.debug('hash_sum of files is different so going to compare lines. reference_file "' + reference_file + '".')
+                #quickest and least memory method is to sum the hash of each line and then compare
+                hash_sum_reference = get_hash_sum_of_lines(reference_file)
+                hash_sum_new_file = get_hash_sum_of_lines(new_file)
+
+                filesAreDifferent = hash_sum_reference != hash_sum_new_file
+                if not filesAreDifferent:
+                    logging.debug('File "' + different_file + '" has identical content just in different order.')
+                else: #files are different in more than just sort order!
+                    #print('hash_sum of files is different so going to compare lines. File "' + different_file + '".')

                    #if the files do not have identical lines get more detailed information of differences
+
                    with open(reference_file, encoding='latin-1') as infile:
+                        reference_header = infile.readline()
                        counts = collections.Counter(l for l in infile)

                    logging.debug('Finished counting lines in reference folder copy of "' + different_file + '". There are '
                        + str(len(counts)) + ' distinct lines')
-                    #logging.debug('perf_time(): ' + str(time.perf_counter() - start_time))
-
-                    #logging.debug('deep_getsizeof(counts): ' + human_readable_bytes(deep_getsizeof(counts, set())))
-                    #logging.debug('perf_time(): ' + str(time.perf_counter() - start_time))

                    with open(new_file, encoding='latin-1') as infile:
+                        new_header = infile.readline()
                        counts.subtract(l for l in infile)
                    logging.debug('Finished checking new version of "' + different_file + '".')
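                    #counts now holds the multiset difference of the two files; illustratively,
                    #Counter(['a\n', 'b\n', 'b\n']) after .subtract(['b\n', 'c\n']) is
                    #{'a\n': 1, 'b\n': 1, 'c\n': -1}: positive counts are lines the new file
                    #lacks, negative counts are lines the reference file lacks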
-                    #logging.debug('perf_time(): ' + str(time.perf_counter() - start_time))
-
-                    missing_from_reference = []
-                    missing_from_new = []
-                    for line, count in counts.items():
-                        if count < 0:
-                            missing_from_reference.append((line, count))
-                        elif count > 0:
-                            missing_from_new.append((line, count))
-
-                    assert len(missing_from_reference) != 0 or len(missing_from_new) != 0, "hash_sum was different but the counts of each distinct are identical!"
-
-                    print('File "' + different_file + '" with ' + str(len(counts)) + ' distinct lines has '
-                        + str(len(missing_from_new)) + ' distinct lines that were not found in the new and '
-                        + str(len(missing_from_reference)) + ' distinct lines that were not found in the reference file')
-
-                    def print_line_and_counts_to_string(identifier, counted_strings):
-                        #sort the missing lines so that the ones shown in reference and new will likely be similar which will make differences easier to spot
-                        counted_strings.sort(key=lambda line_count_tuple: line_count_tuple[0])
-                        if len(counted_strings) > 0:
-                            message = ('All ' if len(counted_strings) <= args.max_different_lines_to_show else ('Sample ' + str(args.max_different_lines_to_show))) + ' lines that are ' + identifier + '.\n'
-                            message += '\n'.join(str(abs(count)) + ': ' + str(line) for line, count in counted_strings[:args.max_different_lines_to_show])
-                            print(message)
-
-                    print_line_and_counts_to_string('missing from new file', missing_from_new)
-                    print_line_and_counts_to_string('missing from reference', missing_from_reference)
-
-                    #logging.debug('perf_time(): ' + str(time.perf_counter() - start_time))
-                    #STOP!
-                    break
+                    if reference_header != new_header:
+                        print('File headers are different!\nref: ' + reference_header + '\nnew: ' + new_header)
+                    else:
+                        missing_from_reference = []
+                        missing_from_new = []
+                        for line, count in counts.items():
+                            if count < 0:
+                                missing_from_reference.append((line, count))
+                            elif count > 0:
+                                missing_from_new.append((line, count))
+
+                        assert len(missing_from_reference) != 0 or len(missing_from_new) != 0, "hash_sum was different but the counts of each distinct line are identical!"
+
+                        print('File "' + different_file + '" with ' + str(len(counts)) + ' distinct lines has '
+                            + str(len(missing_from_new)) + ' distinct lines that were not found in the new file and '
+                            + str(len(missing_from_reference)) + ' distinct lines that were not found in the reference file')
+
+                        #sort lists and only keep the top few lines
+                        missing_from_reference.sort(key=lambda line_count_tuple: line_count_tuple[0])
+                        missing_from_reference = missing_from_reference[:args.max_different_lines_to_show]
+
+                        missing_from_new.sort(key=lambda line_count_tuple: line_count_tuple[0])
+                        missing_from_new = missing_from_new[:args.max_different_lines_to_show]
+
+                        print('hdr: ' + reference_header.strip('\n'))
+                        for missing_line_index in range(0, min(len(missing_from_reference), len(missing_from_new))):
+                            print('ref: ' + missing_from_reference[missing_line_index][0].strip('\n') + '\tmissing count: ' + str(abs(missing_from_reference[missing_line_index][1])))
+                            print('new: ' + missing_from_new[missing_line_index][0].strip('\n') + '\tmissing count: ' + str(abs(missing_from_new[missing_line_index][1])))
+                            print('------')
+            if filesAreDifferent:
+                actuallyDifferentFiles.append(different_file)
+            result = result and not filesAreDifferent
+            #print('Is "' + different_file + '" actually different?: ' + str(filesAreDifferent) + '. Is regression still passing?: ' + str(result))
+
+        print('There were ' + str(len(all_common_different_files)) + ' files that were binary different. Of those, ' + str(len(actuallyDifferentFiles)) + ' differed in ways that mattered: ' + str(actuallyDifferentFiles))
    if result:
-        print('Tests passed. Number of order different files: ' + str(len(all_common_different_files)))
+        print('PASSED! :-)')
    else:
-        dcmp.report_full_closure()
+        print('FAILED! :-(')
    return result

if __name__ == "__main__":