|
"""
Match a DNA sequence against a CSV database of STR counts (CS50x 2023, pset 6).

https://cs50.harvard.edu/x/2023/psets/6/dna/
"""
| 4 | + |
| 5 | +import csv |
| 6 | +import sys |
| 7 | + |
def main():
    """Print the name of the person whose STR counts match a DNA sequence.

    Expects two command-line arguments:

    * ``sys.argv[1]`` -- path to a CSV database. The first row is a header
      whose first column is skipped and whose remaining columns are STR
      names; every following row is a name followed by one integer count
      per STR.
    * ``sys.argv[2]`` -- path to a text file holding the DNA sequence.

    Prints the name from the first database row whose counts equal the
    longest consecutive run of each STR in the sequence, or ``No match``
    when no row qualifies.

    Raises:
        SystemExit: with status 1 when the argument count is wrong.
        ValueError: when a database count is not an integer.
    """
    # Check for command-line usage. Exit right away: carrying on would
    # index sys.argv entries that do not exist.
    if len(sys.argv) != 3:
        print("Usage: python dna.py DATABASE SEQUENCE")
        sys.exit(1)

    # Read database file into a list of rows (header first).
    # newline="" lets the csv module handle line endings itself.
    with open(sys.argv[1], newline="") as file:
        database = list(csv.reader(file))

    # An empty database has no header and nobody to compare against.
    if not database:
        print("No match")
        return

    # Read DNA sequence file into a variable.
    with open(sys.argv[2]) as file:
        sequence = file.read()

    header, *people = database

    # Longest run of each STR, in header order (skipping the first column).
    # A list keeps one entry per column even if an STR name is repeated.
    sequence_dna = [
        longest_match(sequence, subsequence) for subsequence in header[1:]
    ]

    # Check database for matching profiles.
    for person in people:
        # csv.reader yields [] for blank lines; there is no name to report.
        if not person:
            continue

        person_dna = [int(value) for value in person[1:]]

        # Report the first person whose counts all match, on stdout with a
        # normal (zero) exit status.
        if person_dna == sequence_dna:
            print(person[0])
            return

    # Nobody in the database matched.
    print("No match")
| 51 | + |
| 52 | + |
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A "run" is the number of back-to-back, non-overlapping repeats of
    ``subsequence`` starting at some position in ``sequence``.

    Args:
        sequence: The string to search.
        subsequence: The string whose consecutive repeats are counted.

    Returns:
        The largest number of consecutive repeats found at any starting
        position; 0 when there is no occurrence or ``subsequence`` is empty.
    """
    subsequence_length = len(subsequence)

    # An empty subsequence matches at every position without advancing,
    # so counting its repeats would never terminate.
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Try every position in sequence as the start of a run.
    for i in range(len(sequence)):

        # Count consecutive repeats: after each match, look again exactly
        # one subsequence further along. startswith() with an offset past
        # the end of sequence is simply False, which ends the run.
        count = 0
        while sequence.startswith(subsequence, i + count * subsequence_length):
            count += 1

        # Update most consecutive matches found.
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return the
    # longest run found.
    return longest_run
| 89 | + |
| 90 | + |
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments