This shows you the differences between two versions of the page.
— |
cs-142:gene-finding-via-gc-content [2015/05/12 18:26] (current) cs142ta created |
||
---|---|---|---|
Line 1: | Line 1: | ||
+ | =Gene Finding via GC content= | ||
+ | ==Problem== | ||
+ | * Within a long region of genomic sequence, genes are often characterised by a higher GC-content than the background GC-content of the entire genome. | ||
+ | * Write a program to prompt the user for a string of DNA bases (ACTG) | ||
+ | * Calculate the ratio of G’s and C’s to the total number of bases in the sequence | ||
+ | * If the ratio is > .60, report that the sequence is likely a gene; otherwise, report that it is probably not a gene. | ||
+ | |||
+ | A contextually-similar problem is [[Gene Finding via TATA box search]]. | ||
+ | |||
+ | ==Solution== | ||
+ | <code cpp> | ||
+ | /* | ||
+ | Test Case 1: | ||
+ | Input: A (this is an example of a string with no Gs or Cs) | ||
+ | Expected Output: 0, not a gene | ||
+ | Actual Output: 0, not a gene | ||
+ | |||
+ | Test Case 2: | ||
+ | Input: G (this is an example of a string with only Gs and Cs) | ||
+ | Expected Output: 1, probably a gene | ||
+ | Actual Output: 1, probably a gene | ||
+ | |||
+ | Test Case 3: | ||
+ | Input: ACATAGACTAG (this is an example of a string with a mix of all four bases) | ||
+ | Expected Output: .36, probably not a gene | ||
+ | Actual Output: .36, probably not a gene | ||
+ | */ | ||
+ | |||
+ | #include <iostream> | ||
+ | #include <string> | ||
+ | |||
+ | using namespace std; | ||
+ | |||
+ | int main() | ||
+ | { | ||
+ |     // Inputs: DNA sequence (one whitespace-delimited string of bases) | ||
+ |     // Outputs: ratio of G&C to total, and prediction of whether or not it's a gene | ||
+ |     // Returns: 0 on success, 1 if no sequence could be read | ||
+ | |||
+ |     // Prompt the user | ||
+ |     cout << "Please input a DNA sequence: "; | ||
+ |     string dna_string; | ||
+ | |||
+ |     // If the read fails (for example, input ends before any base is typed) the | ||
+ |     // string stays empty. Stop here so the ratio below never divides by a | ||
+ |     // length of zero. | ||
+ |     if (!(cin >> dna_string)) | ||
+ |     { | ||
+ |         cout << "No DNA sequence was entered." << endl; | ||
+ |         system("pause"); | ||
+ |         return 1; | ||
+ |     } | ||
+ | |||
+ |     int gc_count = 0; | ||
+ |     // Calculate GC-content | ||
+ |     // The index uses the string's own size type so it is compared against | ||
+ |     // length() without mixing signed and unsigned values. | ||
+ |     for (string::size_type i = 0; i < dna_string.length(); i++) | ||
+ |     { | ||
+ |         // Look at one base at a time; a single char is enough, no substring needed. | ||
+ |         const char base_at_i = dna_string[i]; | ||
+ | |||
+ |         // The following statement can be uncommented to check that our loop is working correctly. | ||
+ |         // cout << "base at i is " << base_at_i << endl; | ||
+ | |||
+ |         // Count the number of Gs and Cs, accepting upper- or lower-case input. | ||
+ |         if (base_at_i == 'G' || base_at_i == 'C' || base_at_i == 'g' || base_at_i == 'c') | ||
+ |         { | ||
+ |             gc_count++; | ||
+ |         } | ||
+ |     } | ||
+ | |||
+ |     cout << "Total GC count was " << gc_count << endl; | ||
+ | |||
+ |     // Divide by total string length to get ratio. Converting the count to double | ||
+ |     // first avoids integer division, which would truncate the result to 0 or 1. | ||
+ |     double gc_ratio = static_cast<double>(gc_count) / dna_string.length(); | ||
+ |     cout << "GC ratio for sequence was " << gc_ratio << endl; | ||
+ | |||
+ |     // if ratio > .60, then report it's a gene | ||
+ |     if (gc_ratio > .6) | ||
+ |     { | ||
+ |         cout << "You've got a gene on your hands!" << endl; | ||
+ |     } | ||
+ |     else | ||
+ |     { | ||
+ |         cout << "Probably not a gene..." << endl; | ||
+ |     } | ||
+ | |||
+ |     // Runs the command shell's "pause" command so the console window stays open | ||
+ |     // until a key is pressed (a Windows shell command; elsewhere this call has | ||
+ |     // no useful effect). | ||
+ |     system("pause"); | ||
+ |     return 0; | ||
+ | } | ||
+ | </code> |