// Copyright Mirela Andronescu 2005-2008 // Takes as input a parameter set, a file set*.txt, with sequence and various predictions // (including predictions with the latest parameter set), and creates linear constraints for CPLEX // Functions that you need for this file: // - A function that returns the number of parameters used in your model // - A function that takes as input a file with a set of parameters, and fills your internal data structures // - A function that computes how many times each feature occurs in a given structure and the free energy // - [optionally] A function that computes the free energy (under your model) of a sequence folded into a given structure. /////////////////////////////////////////////////////////////////// // MODIFY THIS PART // include your own header files #include "simfold.h" #include "common.h" #include "externs.h" #include "params.h" // Make sure this number for the maximum length of the sequences is large enough #define MAXLEN 5000 // maximum length of sequences // Initialize the data, fill the parameter data structures // You can write anything here // "\" separates lines in the define directive // If you have nothing to initialize, leave it blank: #define INITIALIZE #define INITIALIZE \ char config_file[200] = "../../../MultiRNAFold/params/multirnafold.conf";\ init_data (argv[0], config_file, RNA, 37.0); //fill_data_structures_with_new_parameters ("../../../MultiRNAFold/params/turner_parameters_fm363_constrdangles.txt"); // Provide a function that takes as input a file with a set of parameters, and fills your internal data structures // Prototype: void fill_data_structures_with_new_parameters (char *parameters_file) // The parameters_file should have one parameter value per line. // For example, if you have 3 parameters in your model, with values 1.00, 2.00, and 3.00, the file should be: // 1.00 // 2.00 // 3.00 // (without the "// ", of course). #define FUNCTION_READ_PARAMETERS_FROM_FILE fill_data_structures_with_new_parameters // Provide the name of a function that returns the number of parameters in your model // Prototype: int get_num_params() #define FUNCTION_GET_NUMBER_OF_PARAMETERS get_num_params // Specify whether your energy function is LINEAR or QUADRATIC // * If your energy function is LINEAR in the energy parameters, // then your energy function can be written like this: // - deltaG = c' x + f // - where x is the vector of parameters // - c is the vector of how many times each parameter occurs in the given structure // - c' means c transposed // - f is a constant // - As a simple example, if your model has 3 parameters x1, x2 and x3, and deltaG for some structure is // x1 + x3 + x3 + 0.5, then c'=(1,0,2) and f=0.5; // - The Turner model underlying mfold, simfold, RNAstructure, RNA Vienna package etc is linear. // * If your energy function is QUADRATIC in the energy parameters, // then your energy function can be written like this: // - deltaG = x' P x + c' x + f // - where x is the vector of parameters // - P is a symmetric matrix of the coefficients for each quadratic term // - c is a vector of counts for each linear term // - c' means c transposed // - f is a constant // - As a simple example, if your model has 3 parameters x1, x2 and x3, and deltaG for some structure is // x1*x2 + 0.5*x2*x3 + 2*x2, then P = (0,1,0; 1,0,0.5; 0,0.5,0), c' = (0,2,0) and f=0. // - The Dirks&Pierce and Rivas&Eddy models for pseudoknotted structures are quadratic. #define ENERGY_FUNCTION_DEGREE LINEAR //#define ENERGY_FUNCTION_DEGREE QUADRATIC // Provide the name of a function that returns the number of occurences (counts) for each feature (see below). // * If ENERGY_FUNCTION_DEGREE above is LINEAR, the prototype is: // double get_feature_counts (char *sequence, char *structure, double *linear_vector, double &free_value); // - sequence is a given sequence (input parameter) // - structure is a given secondary structure (input parameter) // - linear_vector is the c vector above, an array of doubles (output parameter). // The function writes the number of occurences for each feature in this vector, // for example linear_vector[0] is the number of times the first feature occurs // - free_value is the free parameter f above. // - assume the memory space was allocated for all input and output variables // - return the free_energy of sequence folded into structure. Free_energy should be = c' x + f. // * If ENERGY_FUNCTION_DEGREE above is QUADRATIC, the prototype is: // double get_feature_counts_quadratic (char *sequence, char *structure, double **quadratic_matrix, double *linear_vector, double &free_value); // - quadratic_matrix is the P matrix above. Assume the space for it is already allocated. // Since the matrix is symmetric, we need only the upper triangle, so only fill it for i <= j. // - linear_vector is the c vector above. Assume the space for it is already allocated. // - free_value is the free parameter f above. // - return the free_energy of sequence folded into structure. Free_energy should be = x' P x + c' x + f. #define FUNCTION_GET_FEATURE_COUNTS get_feature_counts //#define FUNCTION_GET_FEATURE_COUNTS get_feature_counts_quadratic // If you have another function which computes the free energy of a sequence folded into a given structure, // it is a good idea to check whether FUNCTION_GET_FEATURE_COUNTS returns the same value // First set HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION is that's the case #define HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION 1 // If the above is 1, then provide the free energy function // If the above is 0, leave this unchanged. // Prototype: double free_energy_simfold (char *sequence, char *structure); // returns the free energy (in kcal/mol) of sequence folded into structure #if (HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION == 1) #define FUNCTION_FREE_ENERGY free_energy_simfold #endif // That's it, please let me know if it doesn't compile // DO NOT MODIFY ANYTHING FROM NOW ON! /////////////////////////////////////////////////////////////////// // standard header files #include #include #include #include #define LINEAR 1 #define QUADRATIC 2 int read_params_from_file (char *filename, double *params) // read parameters (one per row) in filename, and fill the params array. // return the number of parameters { FILE *file; int i = 0; char buffer [500]; if ((file = fopen (filename, "r")) == NULL) { printf ("Cannot open file %s, ABORT!\n", filename); exit(1); } fgets (buffer, sizeof(buffer), file); while (!feof (file)) { sscanf (buffer, "%lf\n", ¶ms[i++]); fgets (buffer, sizeof(buffer), file); } fclose(file); return i; } void reset_linear (int num_parameters, double *array) { for (int i=0; i < num_parameters; i++) { array[i] = 0; } } void reset_quadratic (int num_parameters, double **array) { for (int i=0; i < num_parameters; i++) { for (int j=i; j < num_parameters; j++) { array[i][j] = 0; } } } #define PARAMTYPE double int main(int argc, char *argv[]) { if (argc != 5) { printf ("Usage: %s \n", argv[0]); exit(0); } // variables FILE *set_file; FILE *constraints_file; FILE *num_constraints_permol_file; char new_params_filename[500]; char set_filename[500]; char constraints_filename[500]; char num_constraints_permol_filename[500]; int i, j; char sequence[MAXLEN]; char structure_known[MAXLEN]; char restricted[MAXLEN]; char structure_pred[MAXLEN]; char buffer [5000]; char header [MAXLEN]; double *linear_vector_known; double *linear_vector_pred; double free_value_known; double free_value_pred; char constraint[50000]; int num_constraints_permol[30000]; for (i=0; i < 30000; i++) { num_constraints_permol[i] = 0; } int num_molecules = 0; double accuracy_threshold = 1.0; // include only those constraints which have accuracy < threshold double energy_fe, energy_this; int replace_instead_of_add = 0; // replacing the old constraints gives very poor results, so this should be 0 char parameter_filename[500]; double alt_energy; // get the input arguments strcpy (new_params_filename, argv[1]); strcpy (set_filename, argv[2]); strcpy (constraints_filename, argv[3]); strcpy (num_constraints_permol_filename, argv[4]); // In simfold, I need to initialize the thermodynamic parameter once INITIALIZE; // fill the internal data structures with the new parameters FUNCTION_READ_PARAMETERS_FROM_FILE (new_params_filename); // I need to know how many parameters there are in the model int num_parameters = FUNCTION_GET_NUMBER_OF_PARAMETERS(); double *params_array; params_array = new double[num_parameters]; int n = read_params_from_file (new_params_filename, params_array); if (n != num_parameters) { printf ("ERROR! Number of parameters in file %s is %d and is NOT the same as what the function %s reported: %d!\n", new_params_filename, n, FUNCTION_GET_NUMBER_OF_PARAMETERS, num_parameters); exit(1); } // now params_array is filled with the values in the given parameter file // Allocate the space for the counters linear_vector_known = new double[num_parameters]; if (linear_vector_known == NULL) { printf ("ERROR! Space could not be allocated for linear_vector_known, ABORT!\n"); exit(1); } linear_vector_pred = new double[num_parameters]; if (linear_vector_pred == NULL) { printf ("ERROR! Space could not be allocated for linear_vector_pred, ABORT!\n"); exit(1); } // allocate the two dimensional array if applicable #if (ENERGY_FUNCTION_DEGREE == QUADRATIC) double **quadratic_matrix_known; quadratic_matrix_known = new double*[num_parameters]; if (quadratic_matrix_known == NULL) { printf ("ERROR! Space could not be allocated for quadratic_matrix_known, ABORT!\n"); exit(1); } for (i=0; i < num_parameters; i++) { // I tried to allocate only i+1 elements, but it crashes during delete - dunno why quadratic_matrix_known[i] = new double[num_parameters]; if (quadratic_matrix_known[i] == NULL) { printf ("ERROR! Space could not be allocated for quadratic_matrix_known[%d], ABORT!\n", i); exit(1); } } double **quadratic_matrix_pred; quadratic_matrix_pred = new double*[num_parameters]; if (quadratic_matrix_pred == NULL) { printf ("ERROR! Space could not be allocated for quadratic_matrix_pred, ABORT!\n"); exit(1); } for (i=0; i < num_parameters; i++) { // I tried to allocate only i+1 elements, but it crashes during delete - dunno why quadratic_matrix_pred[i] = new double[num_parameters]; if (quadratic_matrix_pred[i] == NULL) { printf ("ERROR! Space could not be allocated for quadratic_matrix_known[%d], ABORT!\n", i); exit(1); } } #endif // open the files if ((constraints_file = fopen (constraints_filename,"w")) == NULL) { printf ("ERROR: Cannot open file %s\n", constraints_filename); return 0; } if ((set_file = fopen (set_filename, "r")) == NULL) { printf ("ERROR: Cannot open file %s\n", set_filename); exit (0); } int k = 0; int delta_index = 0; //the format is: // > header // sequence // known structure // [restricted] // pred structure 1 // pred structure 2 // empty line // start reading the input set file fgets (header, sizeof(buffer), set_file); // header "> SSTRAND_ID etc" while (!feof(set_file)) { fgets (sequence, sizeof(buffer), set_file); // sequence fgets (structure_known, sizeof(buffer), set_file); // known structure fgets (buffer, sizeof(buffer), set_file); // predicted structure or restricted line for (j=0; j < sizeof (buffer); j++) { if (j >= MAXLEN) { printf ("ERROR! MAXLEN=%d is too small!\n", MAXLEN); printf ("%s\n", buffer); exit(1); } if (buffer[j] == '\n') { buffer[j] = '\0'; break; } } if ( strstr (buffer, "_") != NULL) // it was the restricted line { strcpy (restricted, buffer); fgets (structure_pred, sizeof(buffer), set_file); } else // it was structure pred { restricted[0] = '\0'; strcpy (structure_pred, buffer); } for (j=0; j < sizeof(buffer); j++) { if (j >= MAXLEN) { printf ("ERROR! MAXLEN=%d is too small!\n", MAXLEN); printf ("%s\n", sequence); exit(1); } if(sequence[j] == '\n') { sequence[j] = '\0'; structure_known[j] = '\0'; structure_pred[j] = '\0'; break; } } for (j=0; j < strlen (restricted); j++) if(restricted[j] == '\n') restricted[j] = '\0'; for (j=0; j < strlen (header); j++) if(header[j] == '\n') header[j] = '\0'; //printf ("\n%s\n%s\n%d\n", sequence, structure_known, delta_index); // now I have the sequence and known structure // Get the counts for the known structure // First reset the counters reset_linear (num_parameters, linear_vector_known); #if (ENERGY_FUNCTION_DEGREE == LINEAR) energy_fe = FUNCTION_GET_FEATURE_COUNTS (sequence, structure_known, linear_vector_known, free_value_known); if (!check_counts_linear (num_parameters, params_array, linear_vector_known, free_value_known, energy_fe)) { // message is printed from check_counts_linear printf ("%s\n%s\n", sequence, structure_known); exit(1); } #elif (ENERGY_FUNCTION_DEGREE == QUADRATIC) reset_quadratic (num_parameters, quadratic_matrix_known); energy_fe = FUNCTION_GET_FEATURE_COUNTS (sequence, structure_known, quadratic_matrix_known, linear_vector_known, free_value_known); if (!check_counts_quadratic (num_parameters, params_array, quadratic_matrix_known, linear_vector_known, free_value_known, energy_fe)) { // message is printed from check_counts_linear printf ("%s\n%s\n", sequence, structure_known); exit(1); } #else printf ("ERROR! Constant ENERGY_FUNCTION_DEGREE should be either LINEAR or QUADRATIC! Abort.\n"); exit(1); #endif // also check if energy_fe is the same as the one provided separately #if (HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION == 1) alt_energy = FUNCTION_FREE_ENERGY (sequence, structure_known); if (fabs (alt_energy - energy_fe) > 0.1) { printf ("dG by %s is different from dG by %s: %.2lf vs. %.2lf for \n%s\n%s\n", FUNCTION_FREE_ENERGY, FUNCTION_GET_FEATURE_COUNTS, alt_energy, energy_fe, sequence, structure_known); exit(1); } #endif // Read all next predicted structures, up to empty line // The first structure pred has been read already above. // This is the structure predicted with the initial parameters while (!feof (set_file)) { if (structure_pred[0] == '\n') break; structure_pred[strlen(sequence)] = '\0'; //printf ("%s\n", structure_pred); fflush (stdout); // ignore it if the predicted structure is the same as the known structure if (strcmp(structure_known, structure_pred) == 0) { fgets (structure_pred, sizeof(buffer), set_file); continue; } // Get the counts for the predicted structure reset_linear (num_parameters, linear_vector_pred); #if (ENERGY_FUNCTION_DEGREE == LINEAR) energy_fe = FUNCTION_GET_FEATURE_COUNTS (sequence, structure_pred, linear_vector_pred, free_value_pred); if (!check_counts_linear (num_parameters, params_array, linear_vector_pred, free_value_pred, energy_fe)) { // message is printed from check_counts_linear printf ("%s\n%s\n", sequence, structure_pred); exit(1); } #elif (ENERGY_FUNCTION_DEGREE == QUADRATIC) reset_quadratic (num_parameters, quadratic_matrix_pred); energy_fe = FUNCTION_GET_FEATURE_COUNTS (sequence, structure_pred, quadratic_matrix_pred, linear_vector_pred, free_value_pred); if (!check_counts_quadratic (num_parameters, params_array, quadratic_matrix_pred, linear_vector_pred, free_value_pred, energy_fe)) { // message is printed from check_counts_linear printf ("%s\n%s\n", sequence, structure_pred); exit(1); } #else printf ("ERROR! Constant ENERGY_FUNCTION_DEGREE should be either LINEAR or QUADRATIC! Abort.\n"); exit(1); #endif // also check if energy_fe is the same as the one provided separately #if (HAVE_ALTERNATIVE_FREE_ENERGY_FUNCTION == 1) alt_energy = FUNCTION_FREE_ENERGY (sequence, structure_pred); if (fabs (alt_energy - energy_fe) > 0.1) { printf ("dG by %s is different from dG by %s: %.2lf vs. %.2lf for \n%s\n%s\n", FUNCTION_FREE_ENERGY, FUNCTION_GET_FEATURE_COUNTS, alt_energy, energy_fe, sequence, structure_pred); exit(1); } #endif // print sequence and structures as comments, but they shouldn't be longer than some 500 or so. char tmps[90]; int len = strlen(sequence); int index = 0; int wl = 80; //fprintf (problem, "\n \\\\ %s", header); while (index < len) { strncpy (tmps, sequence+index, wl); index += wl; } index = 0; while (index < len) { strncpy (tmps, structure_known+index, wl); index += wl; } index = 0; while (index < len) { strncpy (tmps, structure_pred+index, wl); index += wl; } // if (show_comments) // { // fprintf (constraints_file, " \\* %s *\\\n", sequence); // fprintf (constraints_file, " \\* %s *\\\n", structure_known); // fprintf (constraints_file, " \\* %s *\\\n", structure_pred); // fprintf (constraints_file, " \\* "); // } sprintf (constraint, " sc%d: ", delta_index); // structural constraints int l=0; #if (ENERGY_FUNCTION_DEGREE == QUADRATIC) for (i=0; i < num_parameters; i++) { for (j=i; j < num_parameters; j++) { if (quadratic_matrix_known[i][j] - quadratic_matrix_pred[i][j] != 0) { if (l > 0 && quadratic_matrix_known[i][j]-quadratic_matrix_pred[i][j] > 0) { sprintf (constraint, "%s + ", constraint); } if (quadratic_matrix_known[i][j]-quadratic_matrix_pred[i][j] == 1) { sprintf (constraint, "%s x%d x%d ", constraint, i, j); } else if (quadratic_matrix_known[i][j]-quadratic_matrix_pred[i][j] == -1) { sprintf (constraint, "%s - x%d x%d ", constraint, i, j); } else { sprintf (constraint, "%s %g x%d x%d ", constraint, quadratic_matrix_known[i][j]-quadratic_matrix_pred[i][j], i, j); } l++; if (l % 8 == 0) { sprintf (constraint, "%s\n ", constraint); } } } } #endif for (i=0; i < num_parameters; i++) { if (linear_vector_known[i]-linear_vector_pred[i] != 0) { if (l > 0 && linear_vector_known[i]-linear_vector_pred[i] > 0) { sprintf (constraint, "%s + ", constraint); // if (show_comments) // fprintf (constraints_file, " + "); } if (linear_vector_known[i]-linear_vector_pred[i] == 1) { sprintf (constraint, "%s x%d ", constraint, i); // if (show_comments) // fprintf (constraints_file, " %s ", string_params[i]); } else if (linear_vector_known[i]-linear_vector_pred[i] == -1) { sprintf (constraint, "%s - x%d ", constraint, i); // if (show_comments) // fprintf (constraints_file, " - %s ", string_params[i]); } else { sprintf (constraint, "%s %g x%d ", constraint, linear_vector_known[i]-linear_vector_pred[i], i); // if (show_comments) // fprintf (constraints_file, " %.1lf %s ", linear_vector_known[i]-linear_vector_pred[i], string_params[i]); } l++; if (l % 10 == 0) { sprintf (constraint, "%s\n ", constraint); // if (show_comments) // fprintf (constraints_file, " *\\\n \\* "); } } } // if (show_comments) // fprintf (constraints_file, " *\\\n"); // no idea what the difference here is if (replace_instead_of_add) { sprintf (constraint, "%s - d%d < %.2lf\n\n", constraint, delta_index, free_value_pred-free_value_known); } else { sprintf (constraint, "%s - d%d < %.2lf\n\n", constraint, delta_index, free_value_pred-free_value_known); } fprintf (constraints_file, "%s", constraint); delta_index++; num_constraints_permol[num_molecules]++; fgets (structure_pred, sizeof(buffer), set_file); } // end while structures for this sequence num_molecules++; fgets (header, sizeof(buffer), set_file); // header "> SSTRAND_ID etc" } // end big while fclose (constraints_file); fclose (set_file); // now write the number of constraints per molecule in a file if ((num_constraints_permol_file = fopen (num_constraints_permol_filename,"w")) == NULL) { printf ("ERROR: Cannot open file %s\n", num_constraints_permol_filename); return 0; } for (i=0; i < num_molecules; i++) fprintf (num_constraints_permol_file, "%d\n", num_constraints_permol[i]); fclose (num_constraints_permol_file); delete [] linear_vector_known; delete [] linear_vector_pred; #if (ENERGY_FUNCTION_DEGREE == QUADRATIC) for (i=0; i < num_parameters; i++) { delete [] quadratic_matrix_known[i]; delete [] quadratic_matrix_pred[i]; } delete [] quadratic_matrix_known; delete [] quadratic_matrix_pred; #endif delete [] params_array; return 0; }