{ "cells": [ { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "import re\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Parse Clustal Omega alignment output" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "def read_clustal_omega(file_name: str) -> dict:\n", " with open(file_name, 'r') as reader:\n", " # skip the first 3 lines\n", " for _ in range(3):\n", " next(reader)\n", "\n", " res = {}\n", " for line in reader:\n", " line = line.strip()\n", " if not line.startswith(' ') and not line.startswith('\\t') and line and not '*' in line:\n", " tokens = re.split(r'[ ]+|\\t', line)\n", "\n", " if tokens[0] not in res:\n", " res[tokens[0]] = list(tokens[1])\n", " else:\n", " res[tokens[0]] = res[tokens[0]] + list(tokens[1])\n", "\n", " return res" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...1059106010611062106310641065106610671068
KM355915.1ATGGATTATC...----------
KM355864.1ATGGATTATC...----------
KM355862.1ATGGATTATC...----------
KM355863.1ATGGATTATC...----------
KM355861.1ATGGATTATC...----------
\n", "

5 rows × 1069 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... 1059 1060 \\\n", "KM355915.1 A T G G A T T A T C ... - - \n", "KM355864.1 A T G G A T T A T C ... - - \n", "KM355862.1 A T G G A T T A T C ... - - \n", "KM355863.1 A T G G A T T A T C ... - - \n", "KM355861.1 A T G G A T T A T C ... - - \n", "\n", " 1061 1062 1063 1064 1065 1066 1067 1068 \n", "KM355915.1 - - - - - - - - \n", "KM355864.1 - - - - - - - - \n", "KM355862.1 - - - - - - - - \n", "KM355863.1 - - - - - - - - \n", "KM355861.1 - - - - - - - - \n", "\n", "[5 rows x 1069 columns]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seq = read_clustal_omega('data/VARFREQ_2021/clustalo-I20210309-103503-0178-93081341-p1m.clustal_num')\n", "seq_df = pd.DataFrame.from_dict(seq, orient='index')\n", "seq_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count nt occurrences per position " ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ACGT-RSKYMN
0760000000000
1000760000000
2007600000000
3007600000000
4760000000000
5000760000000
6000760000000
7760000000000
8000760000000
9076000000000
\n", "
" ], "text/plain": [ " A C G T - R S K Y M N\n", "0 76 0 0 0 0 0 0 0 0 0 0\n", "1 0 0 0 76 0 0 0 0 0 0 0\n", "2 0 0 76 0 0 0 0 0 0 0 0\n", "3 0 0 76 0 0 0 0 0 0 0 0\n", "4 76 0 0 0 0 0 0 0 0 0 0\n", "5 0 0 0 76 0 0 0 0 0 0 0\n", "6 0 0 0 76 0 0 0 0 0 0 0\n", "7 76 0 0 0 0 0 0 0 0 0 0\n", "8 0 0 0 76 0 0 0 0 0 0 0\n", "9 0 76 0 0 0 0 0 0 0 0 0" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "counts = pd.DataFrame(0, index=['A','C','G','T','-','R','S','K','Y','M','N'], columns=seq_df.columns)\n", "\n", "for colname, col in seq_df.iteritems():\n", " temp = seq_df[colname].value_counts()\n", " \n", " for tidx in temp.index:\n", " counts.at[tidx, colname] = temp[tidx]\n", "\n", "counts.transpose().head(10)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "counts.transpose().to_excel(\"nt_frequency.xlsx\", sheet_name='nt frequency')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count missmatches" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "241" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mm = counts.transpose().sum(axis=1) - counts.transpose().max(axis=1)\n", "mm.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }