#!venv/bin/python2.7
# -*- coding: utf-8 -*-

"""\
Generate random spacers that score well under the Doench16_ "Rule Set 2"
model and write them to ``doench_spacers.tsv``.

Setup:

- Download the supplementary code from Doench16_::

    $ curl -O http://www.nature.com/nbt/journal/v34/n2/extref/nbt.3437-S3.zip

- Unpack the files::

    $ unzip nbt.3437-S3.zip
    $ tar -xzf Code/Rule_Set_2_Score.tar.gz

- Create a virtual environment to run the code in.  This is necessary both
  because the script is ``python2`` and because it requires a very specific
  version of ``scikit-learn`` in order to unpack its pickles::

    $ virtualenv -p python2.7 venv
    $ source venv/bin/activate

- Install the dependencies.  For some reason this has to be done in two
  steps::

    $ pip install numpy scipy pandas matplotlib biopython
    $ pip install scikit-learn==0.16.1

- The ``rs2_score_calculator.py`` script has to be run from the directory it
  lives in::

    $ cd Rule_Set_2_scoring_v1/analysis
    $ python rs2_score_calculator.py --seq AAAAAAAAAAAAAAAAAAAAAAAAAGGAAA
    Rule set 2 score: 0.2183

The Doench16_ rules take 30 bp sequences which contain some context before and
after the spacer itself, specifically ``NNNN[spacer]NGGNNN``.  I need my
spacers to start with ``GGG``, so I generated spacers from this pattern:
``NNNNGGG[17×N]NGGNNN``.  I set the random seed to 0 to make the results
reproducible.

.. [Doench16] Doench et al. *Optimized sgRNA design to maximize activity and
   minimize off-target effects of CRISPR-Cas9.* Nat Biotechnol (2016)
   34:2:184-191.
"""

import os
import pickle
import random
import sys
from pprint import pprint

# Seed before anything else touches the RNG so the generated spacers are
# reproducible from run to run.
random.seed(0)

# The Rule Set 2 code is not an installable package; make its ``analysis``
# directory importable.  The path is relative to the working directory.
sys.path.append('Rule_Set_2_scoring_v1/analysis')
from model_comparison import predict as rule_set_2

# Number of accepted spacers to collect.
num_spacers = 48
# NOTE(review): not referenced anywhere in this file; kept in case other code
# reads it.
filter_factor = 2
# Only spacers scoring strictly above this Rule Set 2 score are kept.
min_score = 0.5
# Pickled model shipped with the Rule Set 2 code (relative to the working
# directory).
model_file = 'Rule_Set_2_scoring_v1/saved_models/V3_model_nopos.pickle'

# NOTE: unpickling executes arbitrary code from the file.  Only run this
# against the pickle that came out of the supplementary archive you trust.
with open(model_file, 'rb') as model_handle:
    model = pickle.load(model_handle)


def _random_bases(length):
    """Return ``length`` bases drawn uniformly at random from ``ATGC``."""
    return ''.join(random.choice('ATGC') for _ in range(length))


def generate_spacer():
    """
    Return a random 30 bp sequence matching ``NNNNGGG[17×N]NGGNNN``.

    The layout is 4 bp of leading context, the 20 bp spacer (``GGG`` followed
    by 17 random bases), the ``NGG`` PAM and 3 bp of trailing context.
    """
    # Start the spacer with 'GGG' so T7 can efficiently transcribe it.
    #
    # The 18 random bases are the remaining 17 bases of the spacer plus the
    # ``N`` of the ``NGG`` PAM.  The draws happen strictly left to right
    # (4, 18, then 3 bases), which keeps the seeded output stable.
    return (
        _random_bases(4) + 'GGG' + _random_bases(18) + 'GG' + _random_bases(3)
    )


def score_spacer(spacer):
    """
    Return the Rule Set 2 score that ``model`` predicts for ``spacer``.

    ``spacer`` is the full 30 bp sequence produced by `generate_spacer()`,
    i.e. it includes the flanking context and the PAM.
    """
    # NOTE(review): the two ``-1`` positional arguments are presumably
    # placeholders for inputs that the "nopos" model ignores -- confirm
    # against ``model_comparison.predict``.
    return rule_set_2(spacer, -1, -1, model=model)


def main():
    """
    Collect ``num_spacers`` spacers scoring above ``min_score`` and save them.

    Each accepted spacer is named ``d1``, ``d2``, ... in the order it was
    accepted.  The results are written as tab-separated ``name``, ``spacer``,
    ``score`` rows to ``doench_spacers.tsv`` in the parent of the directory
    containing this script.
    """
    spacers = []

    while len(spacers) < num_spacers:
        spacer = generate_spacer()
        score = score_spacer(spacer)

        if score > min_score:
            # Number spacers by acceptance order, starting from 1.
            name = 'd{}'.format(len(spacers) + 1)
            spacers.append((name, spacer, score))

    tsv_path = os.path.join(
        os.path.dirname(__file__), '..', 'doench_spacers.tsv')

    with open(tsv_path, 'w') as tsv:
        for row in spacers:
            tsv.write('\t'.join(str(x) for x in row) + '\n')


if __name__ == '__main__':
    main()