I'm really struggling with this exercise about linked lists and node removal.
Basically, DNA sequences are said to be complementary when each base in one sequence has the corresponding complement in the other sequence at the same position. Base A bonds with T, and base C bonds with G. For example, the sequences ACGT and TGCA are complementary.
I have to implement a function which takes two linked lists: the first is the original DNA and the second is the edit sequence. The function must apply the edit method described earlier and return the new list.
For instance, we've got "A C G T A G A C G T T C T A" as the original DNA sequence and "G C A" as the bonding sequence. G matches with C, C matches with G and A matches with T (and vice-versa). Therefore, "C G T" is "G C A"'s reflex. Then we have to remove "C", "G" and T", following this exact order, from the original DNA sequence, which turns out to be the expected result stated below.
Also, two things: 1. I must ignore any subsequent matches generated as a result of any node removals. 2. I cannot use any headers besides stdlib.h and functions such as malloc() or calloc(). I am only allowed to use free().
Input:
A C G T A G A C G T T C T A
G C A
Expected Result:
A A G A T C T A
Actual Result:
A A G A C G T T C T A
Any ideas would be hugely appreciated.
Thanks!
CodePudding user response:
I should have written a code based on your current code for better communication but I thought it will be simpler not to use the recursion. Would you please try:
#include <stdio.h>
#include <stdlib.h>
typedef struct LinkedNode {
char data;
struct LinkedNode *next;
} LinkedNode;
/*
* append the node with data at the end of the list
*/
void append(LinkedNode *head, char data)
{
LinkedNode *newNode = malloc(sizeof(LinkedNode));
newNode->data = data;
newNode->next = NULL;
LinkedNode *temp = head;
while (temp->next != NULL) {
temp = temp->next;
}
temp->next = newNode;
}
/*
* dump the data of the list
*/
void dump(LinkedNode *head)
{
for (head = head->next; head != NULL; head = head->next) {
printf("%c ", head->data);
}
printf("\n");
}
/*
* generate another list of complimentary sequence
*/
void gencomp(LinkedNode *dest, LinkedNode *src)
{
char s_data, d_data;
for (src = src->next; src != NULL; src = src->next) {
s_data = src->data;
switch(s_data) {
case 'A':
d_data = 'T';
break;
case 'T':
d_data = 'A';
break;
case 'C':
d_data = 'G';
break;
case 'G':
d_data = 'C';
break;
default:
fprintf(stderr, "unknown base: %c\n", s_data);
exit(1);
}
append(dest, d_data);
}
}
/*
* return the pointer to the last element if the subsequences match
*/
LinkedNode *match(LinkedNode *head, LinkedNode *comp)
{
for (; head != NULL; head = head->next, comp = comp->next) {
if (head->data != comp->data) {
return NULL;
} else if (comp->next == NULL) {
return head;
}
}
return NULL;
}
/*
* search "head" for "comp". If matched, the matched portion is skipped
* (not freed)
*/
void edit(LinkedNode *head, LinkedNode *comp)
{
LinkedNode *matched;
for (; head != NULL; head = head->next) {
if ((matched = match(head->next, comp->next))) {
head->next = matched->next; // skip matched sequence
}
}
}
int main()
{
char dna_orig[] = "ACGTAGACGTTCTA";
char dna_bond[] = "GCA";
int i;
// generate list of original sequence
LinkedNode *head = malloc(sizeof(LinkedNode));
head->next = NULL;
for (i = 0; dna_orig[i] != 0; i ) {
append(head, dna_orig[i]);
}
// generate list of bonding sequence
LinkedNode *bond = malloc(sizeof(LinkedNode));
bond->next = NULL;
for (i = 0; dna_bond[i] != 0; i ) {
append(bond, dna_bond[i]);
}
// generate list of complementary sequence of bond
LinkedNode *comp = malloc(sizeof(LinkedNode));
comp->next = NULL;
gencomp(comp, bond);
// edit the original sequence and see the result
edit(head, comp);
dump(head);
return 0;
}
Output:
A A G A T C T A
- I have created a list of complementary sequence of the bonding sequence to make the searching easy.
- The code is inefficient so far because it uses repeated linear search in the list.
- I had to include <stdio.h> for the printing purpose.
[Edit]
When freeing the skipped nodes, we cannot free them in the forward order
from head to tail, because the link to the next node is lost as soon as
the first node is freed. Then we can make use of recursion to free them
in the reverse order. Here is the function to free the skipped nodes:
void freenodes(LinkedNode *head, LinkedNode *tail)
{
if (head->next == tail) { // the last node to remove
free(head);
return;
}
freenodes(head->next, tail); // free nodes recursively
free(head);
}
The first argument to the function is advanced one by one until it reaches
the last node of the matched sequence. Then the function returns to the caller
stepping back to the first node of the matched sequence, while freeing the
nodes.
Here is the updated full code including the modification of edit
function
to return the head of the edited list.
#include <stdio.h>
#include <stdlib.h>
typedef struct LinkedNode {
char data;
struct LinkedNode *next;
} LinkedNode;
/*
* append the node with data at the end of the list
*/
void append(LinkedNode *head, char data)
{
LinkedNode *newNode = malloc(sizeof(LinkedNode));
newNode->data = data;
newNode->next = NULL;
LinkedNode *temp = head;
while (temp->next != NULL) {
temp = temp->next;
}
temp->next = newNode;
}
/*
* dump the data of the list
*/
void dump(LinkedNode *head)
{
for (head = head->next; head != NULL; head = head->next) {
printf("%c ", head->data);
}
printf("\n");
}
/*
* compare characters complementarily
* return 1 for matching pairs as A<=>T or C<=>G
*/
int compcomp(char a, char b)
{
if ((a == 'A' && b == 'T') || (a == 'T' && b == 'A')
|| (a == 'C' && b == 'G') || (a == 'G' && b == 'C'))
return 1;
else
return 0;
}
/*
* return the pointer to the last node if the subsequences match complementarily
*/
LinkedNode *match(LinkedNode *head, LinkedNode *bond)
{
for (; head != NULL; head = head->next, bond = bond->next) {
if (! compcomp(head->data, bond->data)) {
return NULL;
} else if (bond->next == NULL) {
return head;
}
}
return NULL;
}
/*
* free skipped nodes
*/
void freenodes(LinkedNode *head, LinkedNode *tail)
{
if (head->next == tail) { // the last node to remove
// printf("> %c\n", head->data); // for debugging
free(head);
return;
}
freenodes(head->next, tail); // free nodes recursively
// printf("> %c\n", head->data); // for debugging
free(head);
}
/*
* search "head" for the sequence of complementary of "bond". If matched,
* the matched portion is skipped and skipped nodes are freed
*/
LinkedNode *edit(LinkedNode *head, LinkedNode *bond)
{
LinkedNode *matched; // last node of matched sequence
LinkedNode *matchednext;
LinkedNode *tmp; // copy of head to traverse the list
for (tmp = head; tmp != NULL; tmp = tmp->next) {
if ((matched = match(tmp->next, bond->next))) {
matchednext = matched->next; // backup matched->next
freenodes(tmp->next, matched->next);
tmp->next = matchednext; // skip matched sequence
}
}
return head;
}
int main()
{
char dna_orig[] = "ACGTAGACGTTCTA";
char dna_bond[] = "GCA";
int i;
// generate list of original sequence
LinkedNode *head = malloc(sizeof(LinkedNode));
head->next = NULL;
for (i = 0; dna_orig[i] != 0; i ) {
append(head, dna_orig[i]);
}
// generate list of bonding sequence
LinkedNode *bond = malloc(sizeof(LinkedNode));
bond->next = NULL;
for (i = 0; dna_bond[i] != 0; i ) {
append(bond, dna_bond[i]);
}
// edit the original sequence and see the result
dump(edit(head, bond));
return 0;
}
BTW you can omit #include <stdio.h>
by dropping printf()
and fprintf()
functions which are used just for the reporting purpose.
[EDIT2]
The main difference between your code and mine is the data structure of
the linked list. The head
of my list has empty data and is the entrance
to the first node, while your dna_original
directly starts with the node
which contains the data. Same with bond
and sequencia
.
Both have its own advantages but we cannot mix them.
Then please modify your editar_dna
as:
LinkedNode *editar_dna(LinkedNode *dna_original, LinkedNode *seq_edicao){
LinkedNode *matched, *matchednext, *tmp;
for (tmp = dna_original; tmp != NULL; tmp = tmp->next) {
if ((matched = match(tmp->next, seq_edicao))) {
matchednext = matched->next;
liberar_nos(tmp->next, matched->next);
tmp->next = matchednext;
}
}
return dna_original;
}
changing seq_edicao->next
to seq_edicao
in match()
function. Then It will output A A A
.
The potential problem of your data structure is we cannot remove
the starting subsequence such as: "CGTACGTA"
. It is technically
possible to fix but may require additional huge modifications.
Now you have three options:
- Neglect the edge case starting with matched subsequence.
- Modify the linked list structure (easy but I'm not sure if it is acceptable.)
- Find a countermeasure keeping current data structure.
It's up to you. :)
[EDIT3]
Here is the version applying option 3 (w/o the modification of main) to your code (FYI):
#include <stdio.h>
#include <stdlib.h>
typedef struct LinkedNode LinkedNode;
struct LinkedNode {
char data;
LinkedNode *next;
};
void imprimir1(LinkedNode *inicio){
if(inicio == NULL){
return;
}
printf("%c", inicio->data);
if (inicio->next!=NULL) printf(" ");
else printf("\n");
imprimir1(inicio->next);
return;
}
void liberar_lista(LinkedNode *inicio){
if(inicio == NULL) return;
liberar_lista(inicio->next);
free(inicio);
}
LinkedNode *inserir_final_r(LinkedNode *inicio, char valor) {
if (inicio == NULL) {
LinkedNode *novo = malloc(sizeof(LinkedNode));
if (novo == NULL) return inicio;
novo->data = valor;
novo->next = NULL;
return novo;
}
inicio->next = inserir_final_r(inicio->next, valor);
return inicio;
}
void liberar_nos(LinkedNode *head, LinkedNode *tail) {
if (head == tail) {
return;
}
liberar_nos(head->next, tail);
free(head);
}
int compare(char a, char b) {
if ((a == 'A' && b == 'T') || (a == 'T' && b == 'A')
|| (a == 'C' && b == 'G') || (a == 'G' && b == 'C'))
return 1;
else
return 0;
}
LinkedNode *match(LinkedNode *head, LinkedNode *bond) {
for (; head != NULL; head = head->next, bond = bond->next) {
if (! compare(head->data, bond->data)) {
return NULL;
} else if (bond->next == NULL) {
return head;
}
}
return NULL;
}
LinkedNode *editar_dna(LinkedNode *dna_original, LinkedNode *seq_edicao){
LinkedNode *matched, *matchednext, *tmp;
// remove leading matched subsequence(s) as a preproces
while ((matched = match(dna_original, seq_edicao))) {
matchednext = matched->next;
liberar_nos(dna_original->next, matched->next); // note we cannot free the 1st node
dna_original->data = matchednext->data;
dna_original->next = matchednext->next;
liberar_nos(matchednext, matchednext->next); // free the copied node which is no longer used
}
/*
for (tmp = dna_original; tmp != NULL; tmp = tmp->next) {
if ((matched = match(tmp->next, seq_edicao))) {
matchednext = matched->next;
liberar_nos(tmp->next, matched->next);
tmp->next = matchednext;
}
}
*/
for (tmp = dna_original; tmp != NULL; ) {
if ((matched = match(tmp->next, seq_edicao))) {
matchednext = matched->next;
liberar_nos(tmp->next, matched->next);
tmp->next = matchednext; // skip matched sequence
} else {
tmp = tmp->next; // proceed to next node
}
}
return dna_original;
}
int main(){
//Creating empty nodes
LinkedNode *dna_original = NULL;
LinkedNode *sequencia = NULL;
//Populating test data into the original sequence
// dna_original = inserir_final_r(dna_original, 'A'); // dropped just for the test
dna_original = inserir_final_r(dna_original, 'C');
dna_original = inserir_final_r(dna_original, 'G');
dna_original = inserir_final_r(dna_original, 'T');
dna_original = inserir_final_r(dna_original, 'C');
dna_original = inserir_final_r(dna_original, 'G');
dna_original = inserir_final_r(dna_original, 'T');
dna_original = inserir_final_r(dna_original, 'A');
dna_original = inserir_final_r(dna_original, 'C');
dna_original = inserir_final_r(dna_original, 'G');
dna_original = inserir_final_r(dna_original, 'T');
dna_original = inserir_final_r(dna_original, 'A');
//Populating test data into the second sequence
sequencia = inserir_final_r(sequencia, 'G');
sequencia = inserir_final_r(sequencia, 'C');
sequencia = inserir_final_r(sequencia, 'A');
//Printing the original sequence before change
imprimir1(dna_original);
//Changing the sequence
editar_dna(dna_original, sequencia);
//Printing the original sequence after change
imprimir1(dna_original);
//Freeing allocated memory
liberar_lista(dna_original);
liberar_lista(sequencia);
return 0;
}
Please note the initialization of the list in main() is modified for the debugging and demonstration purpose.
CodePudding user response:
From the title and discussion I'm assuming that applying the edit should mutate the input list by removing matched subsequences. If the first node of the list is removed, you'll need to return a pointer to some later node. Otherwise the first node is returned.
The naive way to handle the two cases is with specific checks. But, there's a well-known pattern for handling both cases as one. It's pre-pending a "dummy" head node temporarily, which is never removed. Rather than try to explain further, I'll append code. See apply_edit
.
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
typedef struct node_s {
struct node_s *next;
char base;
} Node;
void print(Node *seq) {
for (Node *p = seq; p; p = p->next) printf("%c", p->base);
printf("\n");
}
bool base_match(char x, char y) {
switch (x) {
case 'A': return y == 'T';
case 'C': return y == 'G';
case 'G': return y == 'C';
case 'T': return y == 'A';
}
exit(42); // input error
}
// If the prefix of seq is as given, return the following seq node, else seq.
Node *remove_prefix(Node *prefix, Node *seq) {
Node *p, *s;
for (p = prefix, s = seq; p && s; p = p->next, s = s->next)
if (!base_match(p->base, s->base))
return seq;
return p ? seq : s;
}
Node *apply_edit(Node *edit, Node *seq) {
Node head[1] = {{ seq }};
for (Node *p = head; p && p->next; ) {
Node *next = remove_prefix(edit, p->next);
if (next == p->next) p = next; // No match; advance one.
else p->next = next; // Remove the match; don't advance.
}
return head->next;
}
Node *build_seq(char *gene) {
Node *seq = NULL;
for (int i = strlen(gene) - 1; i >= 0; --i) {
Node *node = malloc(sizeof *node);
node->base = gene[i];
node->next = seq;
seq = node;
}
return seq;
}
int main(void) {
// Provided test case. Expect AAGATCTA.
print(apply_edit(build_seq("GCA"), build_seq("ACGTAGACGTTCTA")));
// Remove from head. Expect A.
print(apply_edit(build_seq("GCA"), build_seq("CGTA")));
// Remove from tail. Expect A.
print(apply_edit(build_seq("GCA"), build_seq("ACGT")));
// Incomplete match at tail. Expect ACG
print(apply_edit(build_seq("GCA"), build_seq("ACG")));
// Remove all. Expect empty string.
print(apply_edit(build_seq("GCA"), build_seq("CGTCGT")));
// Remove creates new match, which is ignored. Expect CGT.
print(apply_edit(build_seq("GCA"), build_seq("CCGTGT")));
return 0;
}
Though it makes for code that's a bit harder to read, you can inline the helper function where it's called in apply_edit
and get something very concise:
Node *apply_edit(Node *edit, Node *seq) {
Node *e, *s, head[1] = {{ seq }};
for (Node *p = head; p && p->next; ) {
for (e = edit, s = p->next; e && s; e = e->next, s = s->next)
if (!base_match(e->base, s->base)) break;
if (e) p = s; // No match; advance one.
else p->next = s; // Remove the match; don't advance.
}
return head->next;
}
I'll also point out that traversing lists with recursion is something you wouldn't do in production code. While a good compiler with optimizations turned up will convert tail recursion (like yours) into iteration, it can be a finicky optimization ... as in the compiler will skip it under non-obvious circumstances. If that happens, your program needs stack space proportional to the length of lists you're traversing, while an iterative code will run fine in one stack frame. That's a big deal if your program needs to be reliable.