// Copyright Mirela Andronescu 2005-2008 // Predicts and outputs minimum free energy secondary structures with new parameters; // also outputs the accuracy of the new predictions // Takes as input: // - a set of new parameters // - a set of sequences with known and initial structures // Outputs: // - a file with sequences, known and previously predicted structures, and the newly predicted structures // - a file with the sensitivity and positive predictive values for the newly predicted structures // You need the following functions (see below for details): // - a function that takes as input a file with a set of parameters, and fills your internal data structures // - a function that predicts the minimum free energy (or low free energy) secondary structure for a given sequence // - if your data has secondary structure constraints, you need a variant of the // prediction function to accomodate constraints // - a function that computes the sensitivity of prediction // - a function that computes the positive predictive value of the prediction /////////////////////////////////////////////////////////////////// // MODIFY THIS PART // Include your own header files #include "simfold.h" #include "common.h" #include "params.h" #include "externs.h" // Make sure this number for the maximum length of the sequences is large enough #define MAXLEN 5000 // maximum length of sequences // Initialize the data, fill the parameter data structures // You can write anything here // "\" separates lines in the define directive #define INITIALIZE \ char config_file[200] = "../../../MultiRNAFold/params/multirnafold.conf";\ init_data (argv[0], config_file, RNA, 37.0); // Provide a function that takes as input a file with a set of parameters, and fills your internal data structures // Example: void fill_data_structures_with_new_parameters (char *parameters_file) // The parameters_file should have one parameter value per line. // For example, if you have 3 parameters in your model, with values 1.00, 2.00, and 3.00, the file should be: // 1.00 // 2.00 // 3.00 // (without the "// ", of course). #define FUNCTION_READ_PARAMETERS_FROM_FILE fill_data_structures_with_new_parameters // If there is a chance there might be structures in the training or testing data sets // that DO NOT have structural constraints, then set UNCONSTRAINTED_STRUCTURES to 1 #define UNCONSTRAINED_STRUCTURES 1 // If the above is 1, then you have to provide a function that predicts // the minimum free energy (or a low free energy) secondary structure. Otherwise ignore this. #if (UNCONSTRAINED_STRUCTURES == 1) // Prototype: double simfold (char *sequence, char *structure) // - sequence is the given RNA sequence to be folded (input parameter) // - structure is the predicted secondary structure (output parameter) // - the function may return the energy, or return nothing, it doesn't matter #define FUNCTION_PREDICTOR simfold #endif // If there is a chance there might be structures in the training or testing data sets // that DO have structural constraints, then set CONSTRAINTED_STRUCTURES to 1 // Note that the training or testing sets might have both constrained and unconstrained structures #define CONSTRAINED_STRUCTURES 1 // If CONSTRAINED_STRUCTURES is 1, then you have to provide a variant of the prediction function // that takes the restricted string as input // If CONSTRAINED_STRUCTURES is 0, it doesn't matter what you write here #if (CONSTRAINED_STRUCTURES == 1) // Prototype: double simfold_restricted (char *sequence, char *restricted, char *structure) // - restricted is a given restriction string // (dots and parentheses for the restricted parts, _ (underscore) for unrestricted) #define FUNCTION_PREDICTOR_CONSTRAINED simfold_restricted #endif // Provide a function that computes the sensitivity of the predicted structure vs. the reference structure // The definition that I use is: number of correctly predicted base pairs / number of true base pairs // Note: if number of true base pairs is 0, I return -1. // Prototype: double compute_sensitivity (char *reference_structure, char *predicted_structure) // - reference_structure is the reference structure, considered ground truth // - predicted_structure is some predicted structure // - returns sensitivity in [0,1] or -1 if denominator is 0 #define FUNCTION_COMPUTE_SENSITIVITY compute_sensitivity // Provide a function that computes the positive predictive value (ppv) of the predicted structure // vs. the reference structure // The definition that I use is: number of correctly predicted base pairs / number of predicted base pairs // Note: if number of predicted base pairs is 0, I return -1. // Prototype: double compute_ppv (char *reference_structure, char *predicted_structure) // - reference_structure is the reference structure, considered ground truth // - predicted_structure is some predicted structure // - returns ppv in [0,1] or -1 if denominator is 0 #define FUNCTION_COMPUTE_PPV compute_ppv // PLEASE LET ME KNOW IF YOU WANT TO USE OTHER ACCURACY FUNCTIONS // That's it, please let me know if it doesn't compile // DO NOT MODIFY ANYTHING FROM NOW ON! /////////////////////////////////////////////////////////////////// #include #include #include #include int main (int argc, char *argv[]) { if (argc != 5) { printf ("Usage: %s \n", argv[0]); exit(0); } char sequence[MAXLEN]; char known_structure[MAXLEN]; char restricted[MAXLEN]; char initial_structure[MAXLEN]; char new_structure[MAXLEN]; double energy; char new_params_filename[200]; char oldset_filename[200]; char newset_filename[200]; FILE *oldset_file, *newset_file; int i, j, k; char buffer[MAXLEN]; char accuracy_output_filename[500]; FILE *accuracy_output_file; strcpy (new_params_filename, argv[1]); strcpy (oldset_filename, argv[2]); strcpy (newset_filename, argv[3]); strcpy (accuracy_output_filename, argv[4]); INITIALIZE; FUNCTION_READ_PARAMETERS_FROM_FILE (new_params_filename); j = 0; double sens_initial_train, sens_new_train; double ppv_initial_train, ppv_new_train; ////////////////////////////////////////////////////// // open the old set, to get the sequence, the true structure and the turner structure if ((oldset_file = fopen(oldset_filename, "r")) == NULL) { printf ("Cannot open file %s\n", oldset_filename); exit(0); } if ((newset_file = fopen(newset_filename, "w")) == NULL) { printf ("Cannot open file %s\n", newset_filename); exit(0); } if ((accuracy_output_file = fopen(accuracy_output_filename, "w")) == NULL) { printf ("Cannot open file %s\n", accuracy_output_filename); exit(0); } //the format is: // > header // sequence // known structure // [restricted] // initial structure // pred structure 1 // pred structure 2 etc // empty line // read the header line here fgets (buffer, sizeof(buffer), oldset_file); // header "> SSTRAND_ID etc" while (!feof(oldset_file)) { fprintf (newset_file, "%s", buffer); fgets (sequence, sizeof(buffer), oldset_file); // sequence fprintf (newset_file, "%s", sequence); fgets (known_structure, sizeof(buffer), oldset_file); // known structure fprintf (newset_file, "%s", known_structure); // next one is either restricted or initial structure fgets (buffer, sizeof(buffer), oldset_file); // header "> SSTRAND_ID etc" if ( strstr (buffer, "_") != NULL) // it was the restricted line { strcpy (restricted, buffer); fprintf (newset_file, "%s", buffer); fgets (initial_structure, sizeof(buffer), oldset_file); // empty line fprintf (newset_file, "%s", initial_structure); } else // it was initial_structure { restricted[0] = '\0'; strcpy (initial_structure, buffer); fprintf (newset_file, "%s", initial_structure); } for (i=0; i < strlen (sequence); i++) { if(sequence[i] == '\n') { sequence[i] = '\0'; known_structure[i] = '\0'; initial_structure[i] = '\0'; break; } } for (i=0; i < strlen (restricted); i++) { if(restricted[i] == '\n') restricted[i] = '\0'; } // predict the new structure, write it in the new set file, and measure its accuracy //printf ("\nS: %s\nR: %s\n", sequence, restricted); if (strcmp (restricted, "") == 0) { if (UNCONSTRAINED_STRUCTURES == 0) { printf ("ERROR! constant UNCONSTRAINED_STRUCTURES is 0, but there are UNrestricted structures in your data set!\n"); exit(1); } #if (UNCONSTRAINED_STRUCTURES == 1) energy = FUNCTION_PREDICTOR (sequence, new_structure); #endif } else { if (CONSTRAINED_STRUCTURES == 0) { printf ("ERROR! constant CONSTRAINED_STRUCTURES is 0, but there are restricted structures in your data set!\n"); exit(1); } #if (CONSTRAINED_STRUCTURES == 1) energy = FUNCTION_PREDICTOR_CONSTRAINED (sequence, restricted, new_structure); #endif } //printf ("N: %s\n", new_structure); // add the new structure into the file only if it is different from known_structure AND from any other structure generated so far. int to_add = 1; // copy the rest of the structures into the new set file while (!feof(oldset_file)) { fgets (buffer, sizeof(buffer), oldset_file); // some other structure if (buffer[0] == '\n') break; buffer [strlen (sequence)] = '\0'; fprintf (newset_file, "%s\n", buffer); if (strcmp (new_structure, buffer) == 0) to_add = 0; } if (strcmp (new_structure, known_structure) == 0) to_add = 0; if (to_add) fprintf (newset_file, "%s\n", new_structure); fprintf (newset_file, "\n"); fflush (newset_file); sens_initial_train = FUNCTION_COMPUTE_SENSITIVITY (known_structure, initial_structure); ppv_initial_train = FUNCTION_COMPUTE_PPV (known_structure, initial_structure); sens_new_train = FUNCTION_COMPUTE_SENSITIVITY (known_structure, new_structure); ppv_new_train = FUNCTION_COMPUTE_PPV (known_structure, new_structure); fprintf (accuracy_output_file, "Initial set sens: %.3lf, Initial set ppv: %.3lf, New set sens: %.3lf, New set ppv: %.3lf\n", sens_initial_train, ppv_initial_train, sens_new_train, ppv_new_train); j++; fgets (buffer, sizeof(buffer), oldset_file); // header "> SSTRAND_ID etc" } fclose (oldset_file); fclose (newset_file); fclose (accuracy_output_file); return 0; }