// Copyright Mirela Andronescu 2005-2008 // Takes as input a file set*.txt, with sequence and various predictions (including predictions with the latest parameter set), and creates linear constraints for CPLEX // Functions that you need for this file: // - A function that returns the number of parameters used in your model // - A function that computes how many times each feature occurs in a given structure // - [optional] If you want to check if your counts are fine, you need a function that fills the array params_array // - [optional] If you want to check the counts are correct, you need a function that computes the free energy /////////////////////////////////////////////////////////////////// // MODIFY THIS PART // include your own header files #include "simfold.h" #include "common.h" #include "externs.h" #include "params.h" // Make sure this number for the maximum length of the sequences is large enough #define MAXLEN 5000 // maximum length of sequences // Make sure this number for the maximum number of parameters in your model is large enough #define MAXNUMPARS 1000 // maximum number of parameters // Initialize the data, fill the parameter data structures // You can write anything here // "\" separates lines in the define directive #define INITIALIZE \ char config_file[200] = "../../../MultiRNAFold/params/multirnafold.conf";\ init_data (argv[0], config_file, RNA, 37.0); //fill_data_structures_with_new_parameters ("../../../MultiRNAFold/params/turner_parameters_fm363_constrdangles.txt"); // Provide the name of a function that returns the number of parameters in your model // Example: int get_num_params() #define FUNCTION_GET_NUMBER_OF_PARAMETERS get_num_params // Provide the name of a function that returns the number of occurences (counts) for each feature // Example: void get_feature_counts (char *sequence, char *structure, double *counter); // - sequence is a given sequence (input parameter) // - structure is a given secondary structure (input parameter) // - counter is an array of doubles (output parameter). // The function writes the number of occurences for each feature in the counter array // for example counter[0] is the number of times the first feature occurs #define FUNCTION_GET_FEATURE_COUNTS get_feature_counts // if you want to check your functions, set DEBUG to 1 and replace the function definitions below #define DEBUG 0 // TODO function definitions // DO NOT MODIFY ANYTHING FROM NOW ON! // Please let me know if it doesn't compile /////////////////////////////////////////////////////////////////// // standard header files #include #include #include #include #define PARAMTYPE double int main(int argc, char *argv[]) { if (argc != 4) { printf ("Usage: %s \n", argv[0]); exit(0); } // variables FILE *set_file; FILE *constraints_file; FILE *num_constraints_permol_file; char set_filename[500]; char constraints_filename[500]; char num_constraints_permol_filename[500]; int i, j; char sequence[MAXLEN]; char structure_known[MAXLEN]; char restricted[MAXLEN]; char structure_pred[MAXLEN]; char buffer [5000]; char header [MAXLEN]; double counter_known[MAXNUMPARS], counter_other[MAXNUMPARS]; char constraint[50000]; int num_constraints_permol[30000]; for (i=0; i < 30000; i++) { num_constraints_permol[i] = 0; } int num_molecules = 0; double accuracy_threshold = 1.0; // include only those constraints which have accuracy < threshold double energy_fe, energy_this; int replace_instead_of_add = 0; // replacing the old constraints gives very poor results, so this should be 0 char parameter_filename[500]; // get the input arguments strcpy (set_filename, argv[1]); strcpy (constraints_filename, argv[2]); strcpy (num_constraints_permol_filename, argv[3]); // In simfold, I need to initialize the thermodynamic parameter once INITIALIZE; // I need to know how many parameters there are in the model int num_parameters = FUNCTION_GET_NUMBER_OF_PARAMETERS(); // For debugging purposes (i.e. in case show_comments below is 1, // it's good to have a function that gives you the feature names int show_comments = 0; // If it's 1, write the feature names in the file #if (DEBUG == 1) PARAMTYPE params_array[MAXNUMPARS]; create_string_params (); save_parameters_in_array (params_array); show_comments = 1; #endif // open the files if ((constraints_file = fopen (constraints_filename,"w")) == NULL) { printf ("ERROR: Cannot open file %s\n", constraints_filename); return 0; } if ((set_file = fopen (set_filename, "r")) == NULL) { printf ("ERROR: Cannot open file %s\n", set_filename); exit (0); } int k = 0; int delta_index = 0; //the format is: // > header // sequence // known structure // [restricted] // pred structure 1 // pred structure 2 // empty line // start reading the input set file fgets (header, sizeof(buffer), set_file); // header "> SSTRAND_ID etc" while (!feof(set_file)) { fgets (sequence, sizeof(buffer), set_file); // sequence fgets (structure_known, sizeof(buffer), set_file); // known structure fgets (buffer, sizeof(buffer), set_file); // predicted structure or restricted line for (j=0; j < sizeof (buffer); j++) { if (j >= MAXLEN) { printf ("ERROR! MAXLEN=%d is too small!\n", MAXLEN); printf ("%s\n", buffer); exit(1); } if (buffer[j] == '\n') { buffer[j] = '\0'; break; } } if ( strstr (buffer, "_") != NULL) // it was the restricted line { strcpy (restricted, buffer); fgets (structure_pred, sizeof(buffer), set_file); } else // it was structure pred { restricted[0] = '\0'; strcpy (structure_pred, buffer); } for (j=0; j < sizeof(buffer); j++) { if (j >= MAXLEN) { printf ("ERROR! MAXLEN=%d is too small!\n", MAXLEN); printf ("%s\n", sequence); exit(1); } if(sequence[j] == '\n') { sequence[j] = '\0'; structure_known[j] = '\0'; structure_pred[j] = '\0'; break; } } for (j=0; j < strlen (restricted); j++) if(restricted[j] == '\n') restricted[j] = '\0'; for (j=0; j < strlen (header); j++) if(header[j] == '\n') header[j] = '\0'; //printf ("\n%s\n%s\n%d\n", sequence, structure_known, delta_index); // now I have the sequence and known structure // Get the counts for the known structure FUNCTION_GET_FEATURE_COUNTS (sequence, structure_known, counter_known); // For sanity, check the free energy computed with free_energy_simfold, and compare with c^T theta. // Erase this if you are not interested in this check or you don't have the function #if (DEBUG == 1) energy_this = 0; for (i=0; i < num_parameters; i++) energy_this += (int)(counter_known[i] * params_array[i]); energy_this += counter_known[num_parameters]; energy_this /= 100.0; if (restricted[0] == '\0') energy_fe = free_energy_simfold (sequence, structure_known); else energy_fe = free_energy_simfold_restricted (sequence, structure_known, restricted); if (fabs(energy_this - energy_fe) > 1) // 0.1 was too small for me, I was getting it sometimes if there were many int21 loops, which used 0.5*other params { printf ("ERROR! free energy known not computed correctly: this=%g, fe=%g, diff=%g\n", energy_this, energy_fe, energy_this-energy_fe); printf ("%s\n%s\n", sequence, structure_known); exit(1); } #endif // Read all next predicted structures, up to empty line // The first structure pred has been read already above. // This is the structure predicted with the initial parameters while (!feof (set_file)) { if (structure_pred[0] == '\n') break; structure_pred[strlen(sequence)] = '\0'; //printf ("%s\n", structure_pred); fflush (stdout); // ignore it if the predicted structure is the same as the known structure if (strcmp(structure_known, structure_pred) == 0) { fgets (structure_pred, sizeof(buffer), set_file); continue; } // Get the counts for the predicted structure FUNCTION_GET_FEATURE_COUNTS (sequence, structure_pred, counter_other); // For sanity, check the free energy computed with free_energy_simfold, and compare with c^T theta. // Erase this if you are not interested in this check or you don't have the function #if (DEBUG == 1) energy_this = 0; for (i=0; i < num_parameters; i++) energy_this += (int)(counter_other[i] * params_array[i]); energy_this += counter_other[num_parameters]; energy_this /= 100.0; if (restricted[0] == '\0') energy_fe = free_energy_simfold (sequence, structure_pred); else energy_fe = free_energy_simfold_restricted (sequence, structure_pred, restricted); if (fabs(energy_this - energy_fe) > 1) { PARAMTYPE params_array[MAXNUMPARS]; save_parameters_in_array (params_array); for (i=0; i <= num_parameters; i++) { if (counter_other[i] != 0) { printf ("%s=%d\t%d\t%.2lf\n", string_params[i], params_array[i], i, counter_other[i]); } } int show_comments = 1; energy_fe = free_energy_simfold (sequence, structure_pred); printf ("ERROR! free energy pred not computed correctly: this=%.2lf, fe=%.2lf\n", energy_this, energy_fe); printf ("%s\n%s\n", sequence, structure_pred); exit(1); } #endif // print sequence and structures as comments, but they shouldn't be longer than some 500 or so. char tmps[90]; int len = strlen(sequence); int index = 0; int wl = 80; //fprintf (problem, "\n \\\\ %s", header); while (index < len) { strncpy (tmps, sequence+index, wl); index += wl; } index = 0; while (index < len) { strncpy (tmps, structure_known+index, wl); index += wl; } index = 0; while (index < len) { strncpy (tmps, structure_pred+index, wl); index += wl; } if (show_comments) { fprintf (constraints_file, " \\* %s *\\\n", sequence); fprintf (constraints_file, " \\* %s *\\\n", structure_known); fprintf (constraints_file, " \\* %s *\\\n", structure_pred); fprintf (constraints_file, " \\* "); } sprintf (constraint, " sc%d: ", delta_index); // structural constraints int l=0; for (i=0; i < num_parameters; i++) { if (counter_known[i]-counter_other[i] != 0) { if (l > 0 && counter_known[i]-counter_other[i] > 0) { sprintf (constraint, "%s + ", constraint); if (show_comments) fprintf (constraints_file, " + "); } if (counter_known[i]-counter_other[i] == 1) { sprintf (constraint, "%s x%d ", constraint, i); if (show_comments) fprintf (constraints_file, " %s ", string_params[i]); } else if (counter_known[i]-counter_other[i] == -1) { sprintf (constraint, "%s - x%d ", constraint, i); if (show_comments) fprintf (constraints_file, " - %s ", string_params[i]); } else { sprintf (constraint, "%s %g x%d ", constraint, counter_known[i]-counter_other[i], i); if (show_comments) fprintf (constraints_file, " %.1lf %s ", counter_known[i]-counter_other[i], string_params[i]); } l++; if (l % 10 == 0) { sprintf (constraint, "%s\n ", constraint); if (show_comments) fprintf (constraints_file, " *\\\n \\* "); } } } if (show_comments) fprintf (constraints_file, " *\\\n"); if (replace_instead_of_add) { sprintf (constraint, "%s - d%d < %.2lf\n\n", constraint, delta_index, (double) ((counter_other[num_parameters]-counter_known[num_parameters])/100.0)); } else { sprintf (constraint, "%s - d%d < %.2lf\n\n", constraint, delta_index, (double) ((counter_other[num_parameters]-counter_known[num_parameters])/100.0)); } fprintf (constraints_file, "%s", constraint); delta_index++; num_constraints_permol[num_molecules]++; fgets (structure_pred, sizeof(buffer), set_file); } // end while structures for this sequence num_molecules++; fgets (header, sizeof(buffer), set_file); // header "> SSTRAND_ID etc" } // end big while fclose (constraints_file); fclose (set_file); // now write the number of constraints per molecule in a file if ((num_constraints_permol_file = fopen (num_constraints_permol_filename,"w")) == NULL) { printf ("ERROR: Cannot open file %s\n", num_constraints_permol_filename); return 0; } for (i=0; i < num_molecules; i++) fprintf (num_constraints_permol_file, "%d\n", num_constraints_permol[i]); fclose (num_constraints_permol_file); return 0; }