<?xml version="1.0"?>
<!DOCTYPE gs540_hw [
  <!-- Internal DTD for a gs540_hw submission document.
       Each element is declared together with its attributes. -->

  <!-- Root element: exactly one results section followed by exactly one
       program section. All three attributes are mandatory. -->
  <!ELEMENT gs540_hw (results, program)>
  <!ATTLIST gs540_hw
    assignment CDATA #REQUIRED
    name       CDATA #REQUIRED
    email      CDATA #REQUIRED>

  <!-- Results section: content is unconstrained (ANY). -->
  <!ELEMENT results ANY>

  <!-- A single result: content is unconstrained (ANY). The type must be one
       of the enumerated values, file is mandatory, positions is optional. -->
  <!ELEMENT result ANY>
  <!ATTLIST result
    type ( first_line
         | nucleotide_histogram
         | background_frequency
         | count_matrix
         | frequency_matrix
         | weight_matrix
         | score_histogram
         | position_list ) #REQUIRED
    file      CDATA #REQUIRED
    positions CDATA #IMPLIED>

  <!-- Program section: one comments element followed by one or more files. -->
  <!ELEMENT program (comments, file+)>
  <!ELEMENT comments ANY>

  <!-- A program file: content is unconstrained (ANY); name is mandatory. -->
  <!ELEMENT file ANY>
  <!ATTLIST file name CDATA #REQUIRED>
]>
<gs540_hw assignment='4' name='student name' email='student email'>  
  <results>
    <result type='first_line' file='filename'>
      first line of the .gbk file
    </result>
    <result type='nucleotide_histogram' file='filename'>
       This should give, for each base or 'ambiguity code' occurring in the sequence,
       the letter denoting the base, followed by an equals sign, followed by an 
       integer giving the number of times the base occurs in the sequence and its complement.
       Put a comma between the different bases.
       E.g. A=50,C=50,G=50,T=50,N=2
    </result>
    <result type='background_frequency' file='filename'>
       Like the nucleotide histogram, but giving the fraction of times (to 4 decimal places)
       each nucleotide occurs in the sequence and its complement. In computing these, ignore
       ambiguity-coded nucleotides.
       E.g. for the counts given above one would get A=.2500,C=.2500,G=.2500,T=.2500
    </result>
    <result type='count_matrix' file='filename'>
       Put the matrix of nucleotide counts at each position in known translation start sites 
       here, as a list (pos,nuc)=count,...
       For example (-10,A)=13,(-10,C)=103,(-10,G)=105,(-10,T)=15,(-9,A)=27, ...
       where the interpretation is that nucleotide A occurs 13 times at position -10 
       in known translation start sites, etc.
       Ignore occurrences of ambiguity-coded nucleotides at each position.
    </result>
    <result type='frequency_matrix' file='filename'>
       Like count matrix, but indicating the fraction of times (to 4 decimal places) 
       each nucleotide occurs at each position, rather than
       the total counts: e.g. (-10,A)=.0551,(-10,C)=.4364, ...
    </result>
    <result type='weight_matrix' file='filename'>
       Like frequency matrix, but giving weight. Give values to three decimal places:
       e.g. (-10,A)=-4.184, ...
    </result>
    <result type='score_histogram' positions='true sites' file='filename'>
      This should be a list of entries of the form (i,n), where i is an integer and n gives the number of times
       a score g.t.e. (greater than or equal to) i and l.t. (less than) i+1 occurred, for the true start sites.
       Omit any i for which no score in that range was observed. Also omit all i's corresponding to scores
       l.t. -50; but include an entry (l.t.-50,n) indicating the number of times a score l.t. -50 occurred.
       E.g. (l.t.-50,403),(-50,35),(-49,17),...
    </result>
    <result type='score_histogram' positions='all' file='filename'>
       As above, but for all positions in the genome (and its complement).
    </result>
    <result type='position_list' file='filename'>
       A list of positions in the genome where scores g.t.e. (greater than or equal to) 5.0 occurred but which
       do NOT correspond to an annotated translation start site. These should be given in the form
       (p,strand,score) where p indicates position (in top-strand co-ordinates, numbered starting from 1),
       strand = 0 (for top) or 1 (for bottom), and score is given to 3 decimal places.
       E.g. (15774,0,5.310),(16007,1,7.632),...
    </result>
  </results>
  <program>
    <comments> 
      put comments about your code here 
    </comments>
    <file name='filename'>
       <![CDATA[
          program source code here...
          if your program outputs the '] ] >' tag shown below,
          please split the output up to avoid having the tag in your
          file before its end
          (i.e. sys.stdout.write("]]") ; sys.stdout.write(">")).
       ]]> 
    </file>
  </program>
</gs540_hw>


