/* dir_acnuc.h : main acnuc header file */

#ifndef DIR_ACNUC_H
#define DIR_ACNUC_H

#include "dir_io.h"
/* Boolean constants; both are skipped when FALSE is already defined at this point */
#if !defined(FALSE)
#  define FALSE 0
#  define TRUE  (!FALSE)
#endif


/*			API TO RECORDS OF ACNUC INDEX FILES 	*/

extern int L_MNEMO; /* number of chars a sequence name is padded to in SUBSEQ records */

/* SUBSEQ: one record for each parent or sub- sequence.
   psub points to the record buffer used by readsub()/writesub(). */
extern struct rsub {
	int length, /* seq length; or 0 if record was deleted */
	   type, /* to SMJYT, for seq type */
	   pext, /* if > 0 this is a subsequence, pext points to EXTRACT for list of exons;
	   	    if <= 0 this is a parent sequence, -pext points to LONGL for list of subseqs */
	   plkey, /* to SHORTL for list of keywords */
	   plinf, /* if parent sequence, plinf points to LOCUS for corresponding record;
	   	     if subsequence, plinf points to SHORTL for list of address of start of annotations;
	   	     	this list contains only one element to be combined with the division rank
	   	     	for access to annotations */
	   phase, /* 100 * code_number + reading_frame_0_1_2 */
	   h; /* to SUBSEQ for next record with same hashing value or 0  */
	char name[1]; /* seq name padded by blanks to L_MNEMO chars */
	} *psub;

/* Byte size of the 7 int fields that precede name.
   Parenthesized so that it can be used inside any larger expression (e.g. n / lrsubptrs). */
#define lrsubptrs (7*sizeof(int))

/* Read / write SUBSEQ record number x through psub; a non-zero result of the record
   function is reported with dir_readerr / dir_writeerr.
   The do/while(0) wrapper makes each macro a single statement, so that it cannot
   capture the else of an enclosing if. Use as a statement: readsub(n); */
#define readsub(x) \
	do { if(read_record_with_key(ksub, (x), psub, psub->name, L_MNEMO)) dir_readerr(ksub, (x)); } while(0)
#define writesub(x) \
	do { if(write_record_with_key(ksub, (x), psub, psub->name, L_MNEMO)) dir_writeerr(ksub, (x)); } while(0)

/* LOCUS: one record for each parent sequence.
   ploc points to the record buffer used by readloc()/writeloc(). */
extern struct rloc {
	int sub, /* to SUBSEQ for corresponding record; or 0 if record was deleted */
	    pnuc, /* offset within flat file of beginning of sequence lines */
	    pinf, /* offset within flat file of beginning of annotation lines */
	    pnuc2,pinf2, /* high-weight parts of offsets to sequence and annotation lines */
	    spec, /* if genbank or embl, to SPECIES for corresponding species
	    	     if swissprot or nbrf, to SHORTL for list of corresponding species */
	    host, /* to SPECIES, for host species of virus or plasmid, or 0 */
	    plref, /* to SHORTL for list of references */
	    molec, /* to SMJYT for molecule */
	    placc, /* to SHORTL for list of accession numbers */
	    stat, /* to SMJYT for status */
	    org, /* to SMJYT for organelle, or 0 if none */
	    div; /* gives rank of corresponding division, useful for div argument of read_annots64 */
	char date[11]; /* seq date in format DD-MMM-YYYY
                        when hoffst <= 2, date in LOCUS file is MM/DD/YYMM/DD/YY */
	} *ploc;

/* Byte size of the 13 int fields that precede date.
   Parenthesized so that it can be used inside any larger expression. */
#define lrlocptrs (13*sizeof(int))

/* Read / write LOCUS record number x through ploc; a non-zero result of the record
   function is reported with dir_readerr / dir_writeerr.
   The do/while(0) wrapper makes each macro a single statement, so that it cannot
   capture the else of an enclosing if. Use as a statement: readloc(n); */
#define readloc(x) \
	do { if(read_locus_record(kloc, (x), ploc)) dir_readerr(kloc, (x)); } while(0)
#define writeloc(x) \
	do { if(write_locus_record(kloc, (x), ploc)) dir_writeerr(kloc, (x)); } while(0)

extern int WIDTH_KW, WIDTH_SP; /* widths of the name field in KEYWORDS and in SPECIES */

/* KEYWORDS/SPECIES: one record for each keyword/species, number 2 is RACINE (tree root).
   pkey is the buffer for KEYWORDS records, pspec the buffer for SPECIES records. */
extern struct rkwsp {
	int libel, /* to TEXT for corresponding label, or 0 if none */
	    plsub, /* to LONGL for list of associated seqs, plsub = 0 for synonyms */
	    desc, /* to SHORTL for list for descendants in tree structure;
	          the absolute value of the first elt of this list is the rank of corresponding
	          record in KEYWORDS/SPECIES; the sign of this number is negative iff there are
	          sequences associated to this record; other elements of list are "desc" values
	          of records of descendants in tree; desc = 0 for synonyms */
	    syno, /* to KEYWORDS/SPECIES for next synonymous keyword, or 0 if none (major has value < 0) */
	    h, /* to KEYWORDS/SPECIES for next record with same hashing value, or 0 */
	    /* used in SPECIES only, to LONGL for list of seqs for which this species is a host, 0 for synos */
	    plhost;
	char name[1];  /* name padded by spaces or truncated to WIDTH_KW/WIDTH_SP chars */
	} *pkey, *pspec;

/* Byte sizes of the int part of a record: 5 ints for KEYWORDS, 6 ints for SPECIES
   (plhost is used in SPECIES only).
   Parenthesized so that they can be used inside any larger expression. */
#define lrkeyptrs (5*sizeof(int))
#define lrspecptrs (6*sizeof(int))
/* reads indifferently in SPECIES/KEYWORDS */
extern struct rkwsp *readkwsp(DIR_FILE *kan, int recnum, int *pwidth);

/* writes indifferently to SPECIES/KEYWORDS; p is the record buffer and w the width of
   its name field. This is an expression: its value is the result of
   write_record_with_key. */
#define writekwsp(kan, recnum, p, w) write_record_with_key((kan), (recnum), (p), (p)->name, (w))

/* Read / write KEYWORDS record x through pkey, or SPECIES record x through pspec;
   a non-zero result of the record function is reported with dir_readerr / dir_writeerr.
   The do/while(0) wrapper makes each macro a single statement, so that it cannot
   capture the else of an enclosing if. Use as a statement: readkey(n); */
#define readkey(x) \
	do { if(read_record_with_key(kkey, (x), pkey, pkey->name, WIDTH_KW)) dir_readerr(kkey, (x)); } while(0)
#define writekey(x) \
	do { if(write_record_with_key(kkey, (x), pkey, pkey->name, WIDTH_KW)) dir_writeerr(kkey, (x)); } while(0)
#define readspec(x) \
	do { if(read_record_with_key(kspec, (x), pspec, pspec->name, WIDTH_SP)) dir_readerr(kspec, (x)); } while(0)
#define writespec(x) \
	do { if(write_record_with_key(kspec, (x), pspec, pspec->name, WIDTH_SP)) dir_writeerr(kspec, (x)); } while(0)

/* SHORTL: linked lists of int values.
	record #2 contains values of the hashing parameters as -hsub , -hkwsp;
	record #3 gives length of SMJYT names and of TEXT labels;
	hashing data for seq names, keywords and species are stored from record #4
	linked list of species and keywords descendants
   pshrt points to the record buffer used by readshrt()/writeshrt(). */
extern struct rshrt {
	int val;  /* value of an element of the short list */
	int next; /* to SHORTL for next element of the short list, or 0 when list is finished */
} *pshrt;

/* byte size of one SHORTL record */
#define lrshrt (sizeof(struct rshrt))

/* Read / write SHORTL record x through pshrt. Both are expressions whose value is
   that of read_shortl_record / write_shortl_record. */
#define readshrt(x)  read_shortl_record(kshrt, (x), pshrt)
#define writeshrt(x) write_shortl_record(kshrt, (x), pshrt)

extern int VALINSHRT2; /* number of values held in the val array of one SHORTL2 record */

/* SHORTL2: series of linked records containing various values.
   pshrt2 points to the record buffer used by readshrt2()/writeshrt2(). */
extern struct rshrt2 {
	unsigned int next; /* to SHORTL2 for next element of the short list, or 0 when list is finished */
	int val[1];        /* array of VALINSHRT2 values or of 0s */
} *pshrt2;

/* Read / write SHORTL2 record x (converted to unsigned) through pshrt2. Both are
   expressions whose value is that of read_shortl2_record / write_shortl2_record. */
#define readshrt2(x)  read_shortl2_record(kshrt2, (unsigned int)(x), pshrt2)
#define writeshrt2(x) write_shortl2_record(kshrt2, (unsigned int)(x), pshrt2)

/* Kinds of short list handled by follow_shortl(); to_shortl is 0 and all other kinds
   are > 0. NOTE(review): each X_of_Y name presumably means "list of X attached to
   one Y" - confirm against the implementation. */
enum shortl_kind {
	to_shortl = 0,
	sub_of_bib,
	spec_of_loc,
	bib_of_loc,
	aut_of_bib,
	bib_of_aut,
	sub_of_acc,
	key_of_sub,
	acc_of_loc
};

/* NOTE(review): presumably steps through the short list of kind slkind designated by
   *p_point, using *p_rank as position state - semantics not visible here, confirm. */
extern unsigned follow_shortl(int *p_point, enum shortl_kind slkind, int *p_rank);

extern int SUBINLNG; /* number of SUBSEQ ranks held in the sub array of one LONGL record */

/* LONGL: series of linked records containing lists of SUBSEQ ranks.
   plng points to the record buffer used by readlng()/writelng(). */
extern struct rlng {
	int next;   /* to LONGL for next element of the long list, or 0 when list is finished */
	int sub[1]; /* array of SUBINLNG ranks of SUBSEQ records or of 0s */
} *plng;

/* Read / write LONGL record x through plng. Both are expressions whose value is
   that of read_longl_record / write_longl_record. */
#define readlng(x)  read_longl_record(klng, (x), plng)
#define writelng(x) write_longl_record(klng, (x), plng)

/* EXTRACT: series of linked records, one for each exon of each subseq.
   pext points to the record buffer used by readext()/writeext(). */
extern struct rext {
	int mere, /* to SUBSEQ for rank of parent seq containing this exon */
	    deb,fin, /* start and end positions of exon within parent sequence */
	    next; /* to EXTRACT for next exon of same subseq , or 0 */
	} *pext;

/* Read / write EXTRACT record number x through pext; a non-zero result of the record
   function is reported with dir_readerr / dir_writeerr.
   The do/while(0) wrapper makes each macro a single statement, so that it cannot
   capture the else of an enclosing if. Use as a statement: readext(n); */
#define readext(x) \
	do { if(read_extract_record(kext, (x), pext)) dir_readerr(kext, (x)); } while(0)
#define writeext(x) \
	do { if(write_extract_record(kext, (x), pext)) dir_writeerr(kext, (x)); } while(0)

extern int WIDTH_SMJ; /* number of chars the name field of SMJYT records is padded to */

/* SMJYT: one record for each Status, Molec, Journal, Year, Type, organ, div.
   psmj points to the record buffer used by readsmj()/writesmj(). */
extern struct rsmj {
	int plong, /* to LONGL for list of associated seqs */
	    libel; /* to TEXT for corresponding label, or 0 if none */
	char name[1]; /* first 2 chars 00, 01,... , 07 give nature of record, others give name
	                 padded by spaces to WIDTH_SMJ chars */
	} *psmj;

/* Byte size of the 2 int fields that precede name.
   Parenthesized so that it can be used inside any larger expression. */
#define lrsmjptrs (2*sizeof(int))

/* Read / write SMJYT record number x through psmj; a non-zero result of the record
   function is reported with dir_readerr / dir_writeerr.
   The do/while(0) wrapper makes each macro a single statement, so that it cannot
   capture the else of an enclosing if. Use as a statement: readsmj(n); */
#define readsmj(x) \
	do { if(read_record_with_key(ksmj, (x), psmj, psmj->name, WIDTH_SMJ)) dir_readerr(ksmj, (x)); } while(0)
#define writesmj(x) \
	do { if(write_record_with_key(ksmj, (x), psmj, psmj->name, WIDTH_SMJ)) dir_writeerr(ksmj, (x)); } while(0)

extern int WIDTH_AUT; /* number of chars the name field of AUTHOR records is padded to */

/* AUTHOR: one record for each author name (initials ignored).
   paut points to the record buffer used by readaut()/writeaut(). */
extern struct raut {
	int plref; /* to SHORTL for list of references this author belongs to */
	char name[1]; /* author name padded to WIDTH_AUT with spaces; if "xxx...xxx" record was deleted */
	} *paut;

/* Read / write AUTHOR record number x through paut; a non-zero result of the record
   function is reported with dir_readerr / dir_writeerr.
   The do/while(0) wrapper makes each macro a single statement, so that it cannot
   capture the else of an enclosing if. Use as a statement: readaut(n); */
#define readaut(x) \
	do { if(read_record_with_key(kaut, (x), paut, paut->name, WIDTH_AUT)) dir_readerr(kaut, (x)); } while(0)
#define writeaut(x) \
	do { if(write_record_with_key(kaut, (x), paut, paut->name, WIDTH_AUT)) dir_writeerr(kaut, (x)); } while(0)

extern int WIDTH_BIB; /* number of chars the name field of BIBLIO records is padded or truncated to */

/* BIBLIO: one record for each reference, book, thesis, or unpublished.
   pbib points to the record buffer used by readbib()/writebib(). */
extern struct rbib {
	int plsub, /* to SHORTL for list of associated parent sequences */
	    plaut, /* to SHORTL for list of associated authors (only 1 for book,thesis,unpubl) */
	    j, /* to SMJYT for rank of corresponding journal, or generic values BOOK, THESIS */
	    y; /* to SMJYT for rank of publication year or for rank of year 0 if unknown */
	char name[1]; /* reference name padded by spaces or truncated to WIDTH_BIB chars
		journal citations appear as JOURNALCODE/vol/first_page
		book citations as BOOK/year/first_author
		theses citations as THESIS/year/first_author
		patent citations as PATENT/number
		other citations as UNPUBL/year/first_author
		*/
	} *pbib;

/* Byte size of the 4 int fields that precede name.
   Parenthesized so that it can be used inside any larger expression. */
#define lrbibptrs (4*sizeof(int))

/* Read / write BIBLIO record number x through pbib; a non-zero result of the record
   function is reported with dir_readerr / dir_writeerr.
   The do/while(0) wrapper makes each macro a single statement, so that it cannot
   capture the else of an enclosing if. Use as a statement: readbib(n); */
#define readbib(x) \
	do { if(read_record_with_key(kbib, (x), pbib, pbib->name, WIDTH_BIB)) dir_readerr(kbib, (x)); } while(0)
#define writebib(x) \
	do { if(write_record_with_key(kbib, (x), pbib, pbib->name, WIDTH_BIB)) dir_writeerr(kbib, (x)); } while(0)

extern int lrtxt; /* number of chars a TEXT label is padded to */
/* TEXT  taxa and keywords can have labels; elts of SMJYT have labels */
extern char *ptxt; /* a label padded to lrtxt chars WITHOUT \n or \0 at end */

/* Read / write TEXT record number x through ptxt.
   readtxt reports with dir_readerr when dir_read does not return 1 for the single
   record requested; writetxt reports with dir_writeerr when dir_write returns non-zero.
   The do/while(0) wrapper makes each macro a single statement, so that it cannot
   capture the else of an enclosing if. Use as a statement: readtxt(n); */
#define readtxt(x) \
	do { if(dir_read(ktxt, (x), 1, ptxt) != 1) dir_readerr(ktxt, (x)); } while(0)
#define writetxt(x) \
	do { if(dir_write(ktxt, (x), 1, ptxt)) dir_writeerr(ktxt, (x)); } while(0)

extern int ACC_LENGTH; /* number of chars the name field of ACCESS records is padded to */

/* ACCESS: one record for each accession number.
   pacc points to the record buffer used by readacc()/writeacc(). */
extern struct racc {
	int plsub; /* to SHORTL for list of associated parent seqs */
	char name[1]; /* real size is ACC_LENGTH (global variable) padded by spaces */
	} *pacc;

/* byte size of the int field that precedes name */
#define lraccptrs (sizeof(int))

/* Read ACCESS record number x through pacc. A non-zero result of
   read_record_with_key is reported with dir_readerr; otherwise a terminating 0 is
   stored at pacc->name[ACC_LENGTH], so the buffer behind pacc must have room for
   that extra byte.
   The do/while(0) wrapper keeps both branches inside one statement, whatever the
   surrounding if/else. Use as a statement: readacc(n); */
#define readacc(x) \
   do { \
      if(read_record_with_key(kacc, (x), pacc, pacc->name, ACC_LENGTH)) dir_readerr(kacc, (x)); \
      else pacc->name[ACC_LENGTH] = 0; \
   } while(0)

/* Write ACCESS record number x from pacc; a non-zero result of write_record_with_key
   is reported with dir_writeerr. Single statement, see readacc. */
#define writeacc(x) \
   do { if(write_record_with_key(kacc, (x), pacc, pacc->name, ACC_LENGTH)) dir_writeerr(kacc, (x)); } while(0)

/* Buffer for seq annotations.
   To access seq annotations, always use read_annots64/next_annots64. */
extern struct rinfo {
	char line[256]; /* to hold one line of annotations */
} *pinfo;

/* byte size of one rinfo buffer */
#define lrinfo (sizeof(struct rinfo))

/* NOTE(review): presumably an upper bound for the WIDTH_* name widths - confirm */
#define WIDTH_MAX   150
/* in bytes; must be multiple of sizeof(int) */
#define ACN_REC_MAX 256


/* API that is independent of the file and record global variables.
     SUBSEQ, KEYWORDS, SPECIES, ACCESS, AUTHOR, BIBLIO, SMJYT:
         int {read,write}_record_with_key(kan, recnum, pointer, name, width)
     LONGL:
         int {read,write}_longl_record(kan, recnum, pointer)
     SHORTL:
         int {read,write}_shortl_record(kan, recnum, pointer)
     LOCUS:
         int {read,write}_locus_record(kan, recnum, pointer)
     EXTRACT:
         int {read,write}_extract_record(kan, recnum, pointer)
*/

/* SHORTL access: with ENDIAN_SENSITIVE defined the record goes straight through
   dir_read/dir_write (the read form is non-zero when dir_read does not return 1);
   without it the record goes through the endian-insensitive functions. */
#if defined(ENDIAN_SENSITIVE)
#  define read_shortl_record(k, x, p)  (dir_read(k, x, 1, p) != 1)
#  define write_shortl_record(k, x, p) dir_write(k, x, 1, p)
#else
#  define read_shortl_record(k, x, p)  read_endian_insensitive(k, x, p)
#  define write_shortl_record(k, x, p) write_endian_insensitive(k, x, p)
#endif
/* Record-level I/O functions. The accessor macros of this header treat a non-zero
   result as failure. */

/* LOCUS */
int read_locus_record(DIR_FILE *kan, int numrec, struct rloc *p);
int write_locus_record(DIR_FILE *kan, int numrec, struct rloc *p);

/* EXTRACT */
int read_extract_record(DIR_FILE *kan, int numrec, struct rext *p);
int write_extract_record(DIR_FILE *kan, int numrec, struct rext *p);

/* records made of int fields followed by a name of widthkey chars */
int read_record_with_key(DIR_FILE *kan, int recnum, void *p, char *name, int widthkey);
int write_record_with_key(DIR_FILE *kan, int recnum, void *p, char *name, int widthkey );

/* used for SHORTL records when ENDIAN_SENSITIVE is not defined */
int read_endian_insensitive(DIR_FILE *kan, int numrec, void *buffer);
int write_endian_insensitive(DIR_FILE *kan, int numrec, void *buffer);

/* LONGL */
int read_longl_record(DIR_FILE *kan, int recnum, struct rlng *p);
int write_longl_record(DIR_FILE *kan, int recnum, struct rlng *p);

/* SHORTL2 */
int read_shortl2_record(DIR_FILE *kan, unsigned recnum, struct rshrt2 *p);
int write_shortl2_record(DIR_FILE *kan, unsigned recnum, struct rshrt2 *p);
/* byte orders; this is the return type of endian_test() */
enum endianness {
	big_endian,
	little_endian
};

/* global state of the data base, defined elsewhere */
extern int must_swap_bytes;
extern int lmot, hoffst, hsub, hkwsp;
extern int nseq, nbrf, lenbit, lenw, maxa, longa;

/* file handles used by the read.../write... macros of this header */
extern DIR_FILE *ksub, *kloc, *kkey, *kspec, *kbib, *kacc, *ktxt;
extern DIR_FILE *ksmj, *kext, *kaut, *kshrt, *kshrt2, *klng;

extern int nbmrfa, flat_format, gcgcod, unixos;
extern int genbank, embl, divisions, swissprot, big_annots;

extern char nucbuf[];    /* to hold in memory the last part of sequence read */
extern char **gcgname;   /* names of division files */
extern int *annotopened; /* says whether any division is currently opened */
extern FILE **divannot;  /* streams associated to opened divisions */


/* prototypes for acnuc access */

/* open the full acnuc data base in readonly mode */
#define acnucopen() dir_acnucopen("RO")

/* puts in blist of length lenw list of parent sequences */
void quick_list_meres(int *blist);

/* open acnuc using only sequences and annotations */
void simpleopen(void);

/* close it */
void dir_acnucclose(void);

/* get seq number, length, reading frame and genetic code from name */
int gsnuml(char *name,  int *length, int *frame, int *gencode);

/* read part of a sequence */
int gfrag(int nsub,int first,int lfrag,char *dseq);

/* access to annotations */
void seq_to_annots64(int numseq, off_t *paddr, int *div);
char *read_annots64(off_t addr, int div);
char *next_annots64(off_t *paddr);

/* get short description */
char *short_descr(int seqnum, char *text, int maxlen);

/* get parent's descrip */
char *short_descr_p(int seqnum, char *text, int maxlen);

/* translate a codon with correct genetic code */
char codaa(char *codon,int code);

/* complete translation of a seq in dyn memory */
char *translate_cds(int seqnum);

/* get short description of variant genetic code */
char *get_code_descr(int code);

/* search key in index file */
int fcode(DIR_FILE *fp, char *search, int lcompar);

/* get sequence number from its name; upper or lowercase */
int isenum(char *name);

/* get species or keyword number from name */
int iknum(char *name, DIR_FILE *fp);

/* used by isenum */
int hashmn(char *nom);

/* used by iknum */
int hasnum(char *name, int width);

/* read a long list in bit list in memory */
void lngbit(int point, int *blist);

/* write message after read err */
void dir_readerr(DIR_FILE *fich, int recnum);

int decode_locus_rec(char *line, char **pname, char **pmolec,
	int *circular, char **pdivision, char **pdate);

/* for any acnuc index file fp, returns the rank of last valid record;
   if endsort != NULL, stores in *endsort the rank of the last alphabetically sorted record */
int read_first_rec(DIR_FILE *fp, int *endsort);

void descen(DIR_FILE *kan, int recnum, int *blist);


/* prototypes of utility functions */

/* returns Watson-Crick complement of nucl */
char complementer_base(char nucl);

/* in place complement strand of seq */
void complementer_seq(char *seq, int len);

/* add trailing spaces: paddedname is the output, name the input */
void padtosize(char *paddedname, char *name, int size);

/* alphabetically compares l1 first chars of s1 and l2 of s2 ignoring trailing spaces
   and returns 0 iff strings are equal */
int strcmptrail(char *s1, int l1, char *s2, int l2);

/* removes all spaces in place */
void compact(char *chaine);

/* converts to uppercase in place */
void majuscules(char *name);

/* removes trailing spaces */
int trim_key(char *name);

enum endianness endian_test(void);

/* conversions between an off_t and its (low, high) pair of ints */
off_t pair_to_offt(int low, int high);
void offt_to_pair(off_t addr, int *plow, int *phigh);

/* every call to gets() made after this point is redirected to emul_gets() */
char *emul_gets(char *str);
#define gets(x) emul_gets(x)


/* prototypes for acnuc management */

/* use RO or WP or WA to control file access */
void dir_acnucopen(char *);

/* flush all data base files */
void dir_acnucflush(void);

/* write message after write err */
void dir_writeerr(DIR_FILE *fich, int recnum);

/* update file size */
void write_first_rec(DIR_FILE *fp, int total, int endsort);

/* modify short list */
int mdshrt(DIR_FILE *kan, int nrec, int offset, int val, int *newplist);

/* modify long list */
int mdlng(DIR_FILE *kan, int nrec, int offset, int val, int *newplist);

/* SHORTL variants used by the add_shortl / sup_shortl macros */
int supshrt(int point, int val);
int addshrt(int point, int val);

/* SHORTL2 variants used by the add_shortl / sup_shortl macros */
int addshrt2(DIR_FILE *k, struct rshrt2 *pk, unsigned point, int val);
int supshrt2(DIR_FILE *k, struct rshrt2 *pk, unsigned point, int val);
/* add_shortl / sup_shortl: dispatch an update of the short list starting at point.
   When slkind > 0 and VALINSHRT2 is non-zero the SHORTL2 function is called on
   (kshrt2, pshrt2) with point converted to unsigned; otherwise the SHORTL function is
   called. The value of the macro is the value returned by the function selected.
   Every argument is parenthesized in the expansion so that an arbitrary expression
   (e.g. a conditional expression passed as slkind) is evaluated as a whole. */
#define add_shortl(point, val, slkind)  ( ((slkind) > 0 && VALINSHRT2) ? \
    addshrt2(kshrt2, pshrt2, (unsigned)(point), (val))  :  addshrt((point), (val)) )
#define sup_shortl(point, val, slkind)  ( ((slkind) > 0 && VALINSHRT2) ? \
    supshrt2(kshrt2, pshrt2, (unsigned)(point), (val))  :  supshrt((point), (val)) )
/* long list counterparts of addshrt / supshrt */
int addlng(int point, int val);
int suplng(int point, int val);

/* remove sequence from data base */
void delseq(int nsub);

/* remove key from hash chains */
void suphsh(int numrec, DIR_FILE *kan);

/* add key to hash chains */
void addhsh(int recnum, DIR_FILE *kan);

/* call when kacc, kaut, kbib or ksmj were updated */
void write_sorted_part(DIR_FILE *fp);

/* to create a species and/or get its number, use ascend=NULL for a new root */
int crespecies(char *ascend, char *nom);

/* to create a keyword and/or get its number, use ascend=NULL for a new root */
int crekeyword(char *ascend, char *nom);
/* swap4b(x): reverse in place the order of the 4 bytes of x.
   x must be an int or unsigned l-value; it is evaluated several times, so it must not
   have side effects.
   Every shift is done on the value converted to unsigned: left-shifting an int into
   or past its sign bit, or left-shifting a negative int, is undefined behaviour.
   The whole expansion is parenthesized so that it is a single assignment expression. */
#define swap4b(x) ((x) = (((unsigned)(x) >> 24) | ((unsigned)(x) << 24) | \
	(((unsigned)(x) & 0xFF00u) << 8) | (((unsigned)(x) & 0xFF0000u) >> 8)))

/* prototypes of bit-manipulation functions
   NOTE(review): the per-function behaviour is not visible in this header; et / ou / non
   are presumably bitwise AND / OR / NOT over bit lists of len ints - confirm. */

/* single bit list */
int irbit(int *pdeblist, int deb, int fin);
void bit1(int *plist, int num);
void bit0(int *plist, int num);
int testbit(int *plist, int num);
int bcount(int *plist, int last);

/* combination of bit lists, result in first argument */
void et(int *listet, int *list1, int *list2, int len);
void ou(int *listou, int *list1, int *list2, int len);
void non(int *listnon, int *list, int len);

/* with ENDIAN_SENSITIVE, every later use of dir_acnucopen names dir_acnucopen_sensitive */
#if defined(ENDIAN_SENSITIVE)
#  define dir_acnucopen dir_acnucopen_sensitive
#endif

/* NOTE(review): presumably the printf length modifier to use for off_t values - confirm */
#if defined(__alpha)
#  define OFFTFORMAT "l"
#else
#  define OFFTFORMAT "ll"
#endif

/* constants defined under VMS only */
#if defined(vms)
#  define MINRECSIZE 52
#  define MAXDOCSTRING 256
#  define MAXRECLENGTH 20000 /* the max rec size under VMS of .seq files for gcg */
#endif

#endif /* DIR_ACNUC_H */