parse.c

 /***************************************************************************/
 /*                                                                         */
 /* This link grammar system is copyright (C) 1991-1997 by Daniel Sleator   */
 /* (sleator@cs.cmu.edu) and Davy Temperley.  This is work in progress.     */
 /* See http://www.cs.cmu.edu/~sleator for announcements of the latest      */
 /* version, detailed documentation, and rules for commercial use.          */
 /*                                                                         */
 /***************************************************************************/

#include "header.h"
#define INFINITY 1000

/* This file contains the exhaustive search algorithm. */

/* Important functions, and what they do.  Cost here always refers to the
   number of null-links used.

   int truecount(int lw, int rw, Connector *le, Connector *re);
       Returns the number of ways to parse the given region with the
       given connector lists.  It fills in the table with these
       counts as it computes them.  This is the same as count() in the
       old version of the parser.  No null links.

   int strictparse(void);
       Just like the old parse().  Returns the number of ways to parse
       the sentence by calling truecount().  No null links.

   int cost(int lw, int rw, Connector *le, Connector *re);
       Returns the cost of the minimum cost way of parsing the given region
       with the given connector lists.  Here cost is defined to be the number
       of null-links required.  It calls cost() recursively.
  
   int count(int lw, int rw, Connector *le, Connector *re);
       This returns the number of linkages for the given region whose cost
       is equal to the minimum cost.  It assumes (unnecessarily) that
       cost() has already been called for that region (and the cost values
       are already in the hash table).

   int calc_min_cost(void);
       This is the wrapper around cost() that computes the minimum cost way
       of parsing the sentence.

   int parse(int mincost);
       Returns the number of ways the sentence can be parsed with cost mincost.

*/

#define NO_ISLANDS TRUE

/* If this is FALSE, then linkages in which an island of words appear
   that are separate from the rest of the sentence are allowed.  If it
   is TRUE, then these are not allwed.  The current post processing
   system eliminates all linkages with islands, so for efficiency sake
   it makes sense to leave this set to TRUE.  */

int match(Connector *a, Connector *b) {
  /* 
     Returns TRUE if s and t match according to the connector matching
     rules.  The connector strings must be properly formed, starting with
     zero or more upper case letters, followed by some other letters, and
     The algorithm is symmetric with respect to a and b.
     
     It works as follows:  The labels must match.  The priorities must be
     compatible (both THIN_priority, or one UP_priority and one DOWN_priority).
     The sequence of upper case letters must match exactly.  After these comes
     a sequence of lower case letters "*"s or "^"s.  The matching algorithm
     is different depending on which of the two priority cases is being
     considered.  See the comments below.  */
    char *s, *t;
    int x, y;
    if (a->label != b->label) return FALSE;
    x = a->priority;
    y = b->priority;
    
    s = a->string;
    t = b->string;
    
    while(isupper(*s) || isupper(*t)) {
	if (*s != *t) return FALSE;
	s++;
	t++;
    }
    
    if ((x==THIN_priority) && (y==THIN_priority)) {
	/*
	   Remember that "*" matches anything, and "^" matches nothing
	   (except "*").  Otherwise two characters match if and only if
	   they're equal.  ("^" can be used in the dictionary just like
	   any other connector.)
	   */
	while ((*s!='\0') && (*t!='\0')) {
	    if ((*s == '*') || (*t == '*') ||
		((*s == *t) && (*s != '^'))) {
		s++;
		t++;
	    } else return FALSE;
	}
	return TRUE;
    } else if ((x==UP_priority) && (y==DOWN_priority)) {
	/*
	   As you go up (namely from x to y) the set of strings that
	   match (in the normal THIN sense above) should get no larger.
	   Read the comment in and.c to understand this.
	   In other words, the y string (t) must be weaker (or at least
	   no stronger) that the x string (s).
	   
	   This code is only correct if the strings are the same
	   length.  This is currently true, but perhaps for safty
	   this assumption should be removed.
	   */
	while ((*s!='\0') && (*t!='\0')) {
	    if ((*s == *t) || (*s == '*') || (*t == '^')) {
		s++;
		t++;
	    } else return FALSE;
	}
	return TRUE;
    } else if ((y==UP_priority) && (x==DOWN_priority)) {
	while ((*s!='\0') && (*t!='\0')) {
	    if ((*s == *t) || (*t == '*') || (*s == '^')) {
		s++;
		t++;
	    } else return FALSE;
	}
	return TRUE;
    } else return FALSE;
}

typedef struct Table_connector Table_connector;
struct Table_connector {
    short              lw, rw;
    Connector         *le, *re;
    short              cost;
    int                count;
    Table_connector   *next;
};

static int table_size;
static Table_connector ** table;

void init_table(void) {
    /* A piecewise exponential function determines the size of the hash table.      */
    /* Probably should make use of the actual number of disjuncts, rather than just */
    /* the number of words                                                          */
    int i;
    if (N_words >= 10) {
	table_size = (1<<16);
	/*  } else if (N_words >= 10) {
	    table_size = (1 << (((6*(N_words-10))/30) + 10)); */
    } else if (N_words >= 4) {
	table_size = (1 << (((6*(N_words-4))/6) + 4));
    } else {
	table_size = (1 << 4);
    }
    table = (Table_connector**) xalloc(table_size * sizeof(Table_connector*));
    for (i=0; i<table_size; i++) {
	table[i] = NULL;
    }
}

int hash(int lw, int rw, Connector *le, Connector *re, int cost) {
    int i;
    i = 0;
    
    i = i + (i<<1) + randtable[(lw + i) & (RTSIZE - 1)];
    i = i + (i<<1) + randtable[(rw + i) & (RTSIZE - 1)];
    i = i + (i<<1) + randtable[(((long) le + i) % (table_size+1)) & (RTSIZE - 1)];
    i = i + (i<<1) + randtable[(((long) re + i) % (table_size+1)) & (RTSIZE - 1)];
    i = i + (i<<1) + randtable[(cost+i) & (RTSIZE - 1)];
    return i & (table_size-1);
}

void free_table(void) {
    int i;
    Table_connector *t, *x;
    
    for (i=0; i<table_size; i++) {
	for(t = table[i]; t!= NULL; t=x) {
	    x = t->next;
	    xfree((void *) t, sizeof(Table_connector));
	}
    }
    xfree((void *) table, table_size * sizeof(Table_connector*));
}

Table_connector * table_pointer(int lw, int rw, Connector *le, Connector *re, int cost) {
    /* returns the pointer to this info, NULL if not there */
    Table_connector *t;
    N_hash_lookups++;
    work_in_hash_lookups++;
    t = table[hash(lw, rw, le, re, cost)];
    for (; t != NULL; t = t->next) {
	work_in_hash_lookups++;
	if ((t->lw == lw) && (t->rw == rw) && (t->le == le) && (t->re == re) && (t->cost == cost))  return t;
    }
    return NULL;
}

int table_lookup(int lw, int rw, Connector *le, Connector *re, int cost) {
    /* returns the count for this quintuple if there, -1 otherwise */    
    Table_connector *t = table_pointer(lw, rw, le, re, cost);

    if (t == NULL) return -1; else return t->count;
}

Table_connector * table_store(int lw, int rw, Connector *le, Connector *re, int cost, int count){
    /* Stores the value in the table.  Assumes it's not already there */
    Table_connector *t, *n;
    int h;
    N_in_table++;
    
    n = (Table_connector *) xalloc(sizeof(Table_connector));
    n->count = count; 
    n->lw = lw; n->rw = rw; n->le = le; n->re = re; n->cost = cost;
    h = hash(lw, rw, le, re, cost);
    t = table[h];
    n->next = t;
    table[h] = n;
    return n;
}

void table_update(int lw, int rw, Connector *le, Connector *re, int cost, int count) {
    /* Stores the value in the table.  Unlike table_store, it assumes it's already there */
    Table_connector *t = table_pointer(lw, rw, le, re, cost);
    
    assert(t != NULL, "This entry is supposed to be in the table.");
    t->count = count;
}


int pseudocount(int lw, int rw, Connector *le, Connector *re, int cost) {
/* returns 0 if and only if this entry is in the hash table with a count value of 0 */
    int count;
    count = table_lookup(lw, rw, le, re, cost);
    if (count == 0) return 0; else return 1;
}

int count(int lw, int rw, Connector *le, Connector *re, int cost) {
    Disjunct * d;
    int total, pseudototal;
    int start_word, end_word, w;
    int leftcount, rightcount;
    int lcost, rcost, Lmatch, Rmatch;

    Match_node * m, *m1;
    Table_connector *t;

    if (cost < 0) return 0;  /* will we ever call it with cost<0 ? */

    t = table_pointer(lw, rw, le, re, cost);

    if (t == NULL) {
	t = table_store(lw, rw, le, re, cost, 0);  /* create the table entry with a tentative cost of 0 */
	                                           /* this cost must be updated before we return */
    } else {
	return t->count;
    }
    
    if (rw == 1+lw) {
	/* lw and rw are neighboring words */
	/* you can't have a linkage here with cost > 0 */
	if ((le == NULL) && (re == NULL) && (cost == 0)) {
	    t->count = 1;
	} else {
	    t->count = 0;
	}
	return t->count;
    }
    
    if ((le == NULL) && (re == NULL)) {
#if NO_ISLANDS
	/* if we don't allow islands (a set of words linked together but
	   separate from the rest of the sentence) then  the cost of skipping
	   n words is just n */
	if (cost == rw-lw-1) {
	    t->count = 1;
	} else {
	    t->count = 0;
	}
	return t->count;
#else
	if (cost == 0) {
	    /* there is no zero-cost solution in this case */
	    /* slight efficiency hack to separate this cost=0 case out */
	    /* but not necessary for correctness */
	    t->count = 0;
	} else {
	    total = 0;
	    w = lw+1;
	    for (d = sentence[w].d; d != NULL; d = d->next) {
		if (d->left == NULL) {
		    total += count(w, rw, d->right, NULL, cost-1);
		}
	    }
	    total += count(w, rw, NULL, NULL, cost-1);
	    t->count = total;
	}
	return t->count;
#endif	    
    }
    
    if (le == NULL) {
	start_word = lw+1;
    } else {
	start_word = le->word;
    }

    if (re == NULL) {
	end_word = MIN(rw-1, lw+maxlinklength); 
    } else {
	end_word = re->word;
    }

    total = 0;
    
    for (w=start_word; w <= end_word; w++) {
	m1 = m = form_match_list(w, le, lw, re, rw); 
	for (; m!=NULL; m=m->next) {
	    d = m->d;
	    for (lcost = 0; lcost <= cost; lcost++) {
		rcost = cost-lcost;
		/* now lcost and rcost are the costs we're assigning to those parts respectively */


		/* Now, we determine if (based on table only) we can see that
		   the current range is not parsable. */

		Lmatch = (le != NULL) && (d->left != NULL) && match(le, d->left);
		Rmatch = (d->right != NULL) && (re != NULL) && match(d->right, re);

		rightcount = leftcount = 0;
		if (Lmatch) {
		    leftcount = pseudocount(lw, w, le->next, d->left->next, lcost);
		    if (le->multi) leftcount += pseudocount(lw, w, le, d->left->next, lcost);
		    if (d->left->multi) leftcount += pseudocount(lw, w, le->next, d->left, lcost);
		    if (le->multi && d->left->multi) leftcount += pseudocount(lw, w, le, d->left, lcost);
		}
		
		if (Rmatch) {
		    rightcount = pseudocount(w, rw, d->right->next, re->next, rcost);
		    if (d->right->multi) rightcount += pseudocount(w,rw,d->right,re->next, rcost);
		    if (re->multi) rightcount += pseudocount(w, rw, d->right->next, re, rcost);
		    if (d->right->multi && re->multi) rightcount += pseudocount(w, rw, d->right, re, rcost);
		}

		pseudototal = leftcount*rightcount;  /* total number where links are used on both sides */

		if (leftcount > 0) {
		    /* evaluate using the left match, but not the right */
		    pseudototal += leftcount * pseudocount(w, rw, d->right, re, rcost);
		}
		if ((le == NULL) && (rightcount > 0)) {
		    /* evaluate using the right match, but not the left */
		    pseudototal += rightcount * pseudocount(lw, w, le, d->left, lcost);
		}

		/* now pseudototal is 0 implies that we know that the true total is 0 */
		if (pseudototal != 0) {
		    rightcount = leftcount = 0;
		    if (Lmatch) {
			leftcount = count(lw, w, le->next, d->left->next, lcost);
			if (le->multi) leftcount += count(lw, w, le, d->left->next, lcost);
			if (d->left->multi) leftcount += count(lw, w, le->next, d->left, lcost);
			if (le->multi && d->left->multi) leftcount += count(lw, w, le, d->left, lcost);
		    }

		    if (Rmatch) {
			rightcount = count(w, rw, d->right->next, re->next, rcost);
			if (d->right->multi) rightcount += count(w,rw,d->right,re->next, rcost);
			if (re->multi) rightcount += count(w, rw, d->right->next, re, rcost);
			if (d->right->multi && re->multi) rightcount += count(w, rw, d->right, re, rcost);
		    }

		    total += leftcount*rightcount;  /* total number where links are used on both sides */

		    if (leftcount > 0) {
			/* evaluate using the left match, but not the right */
			total += leftcount * count(w, rw, d->right, re, rcost);
		    }
		    if ((le == NULL) && (rightcount > 0)) {
			/* evaluate using the right match, but not the left */
			total += rightcount * count(lw, w, le, d->left, lcost);
		    }
		}
	    }
	}
	    
	put_match_list(m1);
    }
    t->count = total;
    return total;
}

/*
To: Danny Sleator <sleator@CS.cmu.edu>
From: John Lafferty <lafferty@CS.cmu.edu>
Subject: improved parse procedure
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Date: Mon, 08 Jan 1996 19:13:23 -0500
Message-ID: <27437.821146403@HARROLD.IS.CS.CMU.EDU>
Sender: John_Lafferty@HARROLD.IS.CS.CMU.EDU


Danny,

Here's another, equivalent way of writing the "parse" procedure in your
version of the code with null links (e.g., the release version):

int parse(int cost) {
    return count(-1, N_words, NULL, NULL, cost+1);
}

I'm making pretty good progress on writing the inside/outside
training.  I have the basic routines written and working with 
dummy procedures for the probabilities (but these procedures 
are the hard part!).

John
*/

int parse(int cost) {
    /* Returns the number of ways the sentence can be parsed with the
       specified cost Assumes that the hash table has already been
       initialized, and is freed later.  */
    
    Disjunct * dis;
    int total=0;
    
    for (dis = sentence[0].d; dis != NULL; dis = dis->next) {
	if (dis->left == NULL) {
	    total += count(0, N_words, dis->right, NULL, cost);
	}
    }

    /* the following allows the possibility of a null link used on the leftmost word */
    total += count(0, N_words, NULL, NULL, cost);
    
    if (verbosity > 1) {
	printf("Total count with cost %d:   %d\n", cost, total);
    }
    return total;
}

/*
   
   CONJUNCTION PRUNING.
   
   The basic idea is this.  Before creating the fat disjuncts,
   we run a modified version of the exhaustive search procedure.
   Its purpose is to mark the disjuncts that can be used in any
   linkage.  It's just like the normal exhaustive search, except that
   if a subrange of words are deletable, then we treat them as though
   they were not even there.  So, if we call the function in the
   situation where the set of words between the left and right one
   are deletable, and the left and right connector pointers
   are NULL, then that range is considered to have a solution.
   
   There are actually two procedures to implement this.  One is
   mark_region() and the other is region_valid().  The latter just
   checks to see if the given region can be completed (within it).
   The former actually marks those disjuncts that can be used in
   any valid linkage of the given region.
   
   As in the standard search procedure, we make use of the fast-match
   data structure (which requires power pruning to have been done), and
   we also use a hash table.  The table is used differently in this case.
   The meaning of values stored in the table are as follows:
   
   -1  Nothing known (Actually, this is not stored.  It's returned
   by table_lookup when nothing is known.)
   0  This region can't be completed (marking is therefore irrelevant)
   1  This region can be completed, but it's not yet marked
   2  This region can be completed, and it's been marked.
   */  

int region_valid(int lw, int rw, Connector *le, Connector *re) {
    /* Returns 0 if this range cannot be successfully filled in with           */
    /* links.  Returns 1 if it can, and it's not been marked, and returns      */
    /* 2 if it can and it has been marked.                                     */
    
    Disjunct * d;
    int left_valid, right_valid, found;
    int i, start_word, end_word;
    int w;
    Match_node * m, *m1;
    
    i = table_lookup(lw, rw, le, re, 0);
    if (i >= 0) return i;
    
    if ((le == NULL) && (re == NULL) && deletable[lw][rw]) {
	table_store(lw, rw, le, re, 0, 1);
	return 1;
    }
    
    if (le == NULL) {
	start_word = lw+1;
    } else {
	start_word = le->word;
    }
    if (re == NULL) {
	end_word = MIN(rw-1, lw+maxlinklength);
    } else {
	end_word = re->word;
    }
    
    found = 0;
    
    for (w=start_word; w <= end_word; w++) {
	m1 = m = form_match_list(w, le, lw, re, rw); 
	for (; m!=NULL; m=m->next) {
	    d = m->d;
	    mark_cost++;
	    /* in the following expressions we use the fact that 0=FALSE. Could eliminate
	       by always saying "region_valid(...) != 0"  */
	    left_valid = (((le != NULL) && (d->left != NULL) && prune_match(le, d->left)) &&
			  ((region_valid(lw, w, le->next, d->left->next)) ||
			   ((le->multi) && region_valid(lw, w, le, d->left->next)) ||
			   ((d->left->multi) && region_valid(lw, w, le->next, d->left)) ||
			   ((le->multi && d->left->multi) && region_valid(lw, w, le, d->left))));
	    if (left_valid && region_valid(w, rw, d->right, re)) {
		found = 1;
		break;
	    }
	    right_valid = (((d->right != NULL) && (re != NULL) && prune_match(d->right, re)) &&
			   ((region_valid(w, rw, d->right->next,re->next))    ||
			    ((d->right->multi) && region_valid(w,rw,d->right,re->next))  ||
			    ((re->multi) && region_valid(w, rw, d->right->next, re))  ||
			    ((d->right->multi && re->multi) && region_valid(w, rw, d->right, re))));
	    if ((left_valid && right_valid) || (right_valid && region_valid(lw, w, le, d->left))) {
		found = 1;
		break;
	    }
	}
	put_match_list(m1);
	if (found != 0) break;
    }
    table_store(lw, rw, le, re, 0, found);
    return found;
}

void mark_region(int lw, int rw, Connector *le, Connector *re) {
    /* Mark as useful all disjuncts involved in some way to complete the structure  */
    /* within the current region.  Note that only disjuncts strictly between        */
    /* lw and rw will be marked.  If it so happens that this region itself is not   */
    /* valid, then this fact will be recorded in the table, and nothing else happens*/    
    
    Disjunct * d;
    int left_valid, right_valid, i;
    int start_word, end_word;
    int w;
    Match_node * m, *m1;
    
    i = region_valid(lw, rw, le, re);
    if ((i==0) || (i==2)) return;
    /* we only reach this point if it's a valid unmarked region, i=1 */
    table_update(lw, rw, le, re, 0, 2);
    
    if ((le == NULL) && (re == NULL) && (null_links) && (rw != 1+lw)) {
	w = lw+1;
	for (d = sentence[w].d; d != NULL; d = d->next) {
	    if ((d->left == NULL) && region_valid(w, rw, d->right, NULL)) {
		d->marked = TRUE;
		mark_region(w, rw, d->right, NULL);
	    }
	}
	mark_region(w, rw, NULL, NULL);
	return;
    }
    
    if (le == NULL) {
	start_word = lw+1;
    } else {
	start_word = le->word;
    }
    if (re == NULL) {
	end_word = MIN(rw-1, lw+maxlinklength);
    } else {
	end_word = re->word;
    }
    
    for (w=start_word; w <= end_word; w++) {
	m1 = m = form_match_list(w, le, lw, re, rw); 
	for (; m!=NULL; m=m->next) {
	    d = m->d;
	    mark_cost++;
	    left_valid = (((le != NULL) && (d->left != NULL) && prune_match(le, d->left)) &&
			  ((region_valid(lw, w, le->next, d->left->next)) ||
			   ((le->multi) && region_valid(lw, w, le, d->left->next)) ||
			   ((d->left->multi) && region_valid(lw, w, le->next, d->left)) ||
			   ((le->multi && d->left->multi) && region_valid(lw, w, le, d->left))));
	    right_valid = (((d->right != NULL) && (re != NULL) && prune_match(d->right, re)) &&
			   ((region_valid(w, rw, d->right->next,re->next)) ||
			    ((d->right->multi) && region_valid(w,rw,d->right,re->next))  ||
			    ((re->multi) && region_valid(w, rw, d->right->next, re)) ||
			    ((d->right->multi && re->multi) && region_valid(w, rw, d->right, re))));
	    
	    /* The following if statements could be restructured to avoid superfluous calls
	       to mark_region.  It didn't seem a high priority, so I didn't optimize this.
	       */
	    
	    if (left_valid && region_valid(w, rw, d->right, re)) {
		d->marked = TRUE;
		mark_region(w, rw, d->right, re);
		mark_region(lw, w, le->next, d->left->next);
		if (le->multi) mark_region(lw, w, le, d->left->next);
		if (d->left->multi) mark_region(lw, w, le->next, d->left);
		if (le->multi && d->left->multi) mark_region(lw, w, le, d->left);
	    }
	    
	    if (right_valid && region_valid(lw, w, le, d->left)) {
		d->marked = TRUE;
		mark_region(lw, w, le, d->left);
		mark_region(w, rw, d->right->next,re->next);
		if (d->right->multi) mark_region(w,rw,d->right,re->next);
		if (re->multi) mark_region(w, rw, d->right->next, re);
		if (d->right->multi && re->multi) mark_region(w, rw, d->right, re);
	    }
	    
	    if (left_valid && right_valid) {
		d->marked = TRUE;
		mark_region(lw, w, le->next, d->left->next);
		if (le->multi) mark_region(lw, w, le, d->left->next);
		if (d->left->multi) mark_region(lw, w, le->next, d->left);
		if (le->multi && d->left->multi) mark_region(lw, w, le, d->left);
		mark_region(w, rw, d->right->next,re->next);
		if (d->right->multi) mark_region(w,rw,d->right,re->next);
		if (re->multi) mark_region(w, rw, d->right->next, re);
		if (d->right->multi && re->multi) mark_region(w, rw, d->right, re);
	    }
	}
	put_match_list(m1);
    }
}

void conjunction_prune(void) {
    /*
       We've already built the sentence disjuncts, and we've pruned them
       and power_pruned(GENTLE) them also.  The sentence contains a
       conjunction.  deletable[][] has been initialized to indicate the
       ranges which may be deleted in the final linkage.
       
       This routine deletes irrelevant disjuncts.  It finds them by first
       marking them all as irrelevant, and then marking the ones that
       might be useable.  Finally, the unmarked ones are removed.
       
       */
    Disjunct * d, * d_head, * dx;
    int w;
    
    /* we begin by unmarking all disjuncts.  This would not be necessary if
       whenever we created a disjunct we cleared its marked field.
       I didn't want to search the program for all such places, so
       I did this way.
       */
    for (w=0; w<N_words; w++) {
	for (d=sentence[w].d; d != NULL; d=d->next) {
	    d->marked = FALSE;
	}
    }
    
    init_fast_matcher();
    init_table();
    
    for (d = sentence[0].d; d != NULL; d = d->next) {
	if ((d->left == NULL) && region_valid(0, N_words, d->right, NULL)) {
	    mark_region(0, N_words, d->right, NULL);
	    d->marked = TRUE;
	}
    }
    
    for (w=0; w<N_words; w++) {
	d_head = NULL;
	for (d=sentence[w].d; d != NULL; d=dx) {
	    dx = d->next;
	    if (d->marked) {
		d->next = d_head;
		d_head = d;
	    } else {
		d->next = NULL;
		free_disjuncts(d);
	    }
	}
	sentence[w].d = d_head;
    }
    
    free_fast_matcher();
    free_table();    
}