\documentclass[12pt]{article}
\usepackage{fullpage}
\usepackage{microtype} % microtypography
\usepackage{array}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{amsthm}

%% Header
\usepackage{fancyhdr}
\fancyhf{}
\fancyhead[C]{COMP 136 - 2021s - HW1 Submission}
\fancyfoot[C]{\thepage} % page number
\renewcommand\headrulewidth{0pt}
\pagestyle{fancy}

%% Hyperlinks always black, no weird boxes
\usepackage[hyphens]{url}
\usepackage[colorlinks=true,allcolors=black,pdfborder={0 0 0}]{hyperref}

%%% Doc layout
\usepackage{parskip}
\usepackage{times}

%%% Write out problem statements in blue, solutions in black
\usepackage{color}
\newcommand{\officialdirections}[1]{{\color{blue} #1}}

%%% Avoid automatic section numbers (we'll provide our own)
\setcounter{secnumdepth}{0}

\begin{document}
~~\\ %% add vert space

{\Large{\bf Student Name: TODO}}

{\Large{\bf Collaboration Statement:}}

Turning in this assignment indicates you have abided by the course Collaboration Policy:
\url{www.cs.tufts.edu/comp/136/2021s/index.html#collaboration-policy}

Total hours spent: TODO

I consulted the following resources:
\begin{itemize}
\item TODO
\item TODO
\item $\ldots$
\end{itemize}

FYI: official instructions for all problems can be found at:
\url{www.cs.tufts.edu/comp/136/2021s/hw1.html}

\tableofcontents

\newpage

\section*{Problem 1: Mean for the Beta and Dirichlet}

\officialdirections{
\subsection*{1a: Problem Statement}
Let $\rho \in (0.0, 1.0)$ be a Beta-distributed random variable: $\rho \sim \text{Beta}(a, b)$.

Show that $\mathbb{E}[ \rho ] = \frac{a}{a + b}$.

{\bf Hint:} You can use these identities, which hold for all $a > 0$ and $b > 0$:
\begin{align}
\Gamma(a) &= \int_{t=0}^{\infty} e^{-t} t^{a-1} dt
\\
\Gamma(a+1) &= a \Gamma(a)
\\
\int_{0}^1 \rho^{a-1} (1-\rho)^{b-1} d\rho &= \frac{\Gamma(a)\Gamma(b)}{\Gamma(a+b)}
\end{align}
}

\subsection{1a: Solution}
TODO YOUR SOLUTION HERE
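
One possible opening move, sketched from the hint identities alone (a sketch, not the official solution): write the expectation against the normalized Beta density, then fold the extra factor of $\rho$ into the exponent,
\begin{align*}
\mathbb{E}[\rho]
&= \int_0^1 \rho \cdot \frac{\Gamma(a+b)}{\Gamma(a)\Gamma(b)} \rho^{a-1} (1-\rho)^{b-1} \, d\rho
= \frac{\Gamma(a+b)}{\Gamma(a)\Gamma(b)} \int_0^1 \rho^{(a+1)-1} (1-\rho)^{b-1} \, d\rho.
\end{align*}
Identity (3) then evaluates the remaining integral, and identity (2) reduces the resulting ratio of Gamma functions to $\frac{a}{a+b}$.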

\newpage

\officialdirections{
\subsection*{1b: Problem Statement}
Let $\mu$ be a Dirichlet-distributed random variable: $\mu \sim \text{Dir}(a_1, \ldots, a_V)$.

Show that $\mathbb{E}[ \mu_w ] = \frac{a_w}{\sum_{v=1}^V a_v}$, for any integer $w$ that indexes a vocabulary word.

{\bf Hint:} You can use the identity:
\begin{align}
\int \mu_1^{a_1-1} \mu_2^{a_2 - 1} \ldots \mu_V^{a_V-1} d\mu
&= \frac{\prod_{v=1}^V \Gamma(a_v)}{\Gamma(a_1 + a_2 + \ldots + a_V)}
\end{align}
}

\subsection{1b: Solution}
TODO YOUR SOLUTION HERE

\newpage

\section*{Problem 2}

\officialdirections{
\subsection*{2a: Problem Statement}
Show that the likelihood of all $N$ observed words can be written as:
\begin{align}
p(X_1 = x_1, X_2 = x_2, \ldots, X_N = x_N | \mu) = \prod_{v=1}^V \mu_v^{n_v}
\end{align}
where $n_v$ counts how many of the $N$ observed words equal vocabulary word $v$.
}

\subsection{2a: Solution}
TODO YOUR SOLUTION HERE

\newpage

\officialdirections{
\subsection*{2b: Problem Statement}
Derive the probability mass function for the predictive posterior. That is, show that after seeing the $N$ training words, the probability of the next word $X_*$ being vocabulary word $w$ is:
\begin{align}
p( X_* = w | X_1 = x_1 \ldots X_N = x_N, \alpha) = \frac{n_w + \alpha}{N + V\alpha}
\end{align}
}

\subsection{2b: Solution}
TODO YOUR SOLUTION HERE

\newpage

\officialdirections{
\subsection*{2c: Problem Statement}
Derive the probability mass function for the joint configuration of the observed training data. That is, show that the probability of the observed $N$ training words is:
\begin{align}
p( X_1 = x_1 \ldots X_N = x_N | \alpha)
&= \frac
{ \Gamma(V \alpha) \prod_{v=1}^V \Gamma( n_v + \alpha ) }
{ \Gamma(N + V \alpha ) \prod_{v=1}^V \Gamma(\alpha) }
\end{align}
}

\subsection{2c: Solution}
TODO YOUR SOLUTION HERE
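
One possible setup, sketched under the assumption (as in the official problem context) of a symmetric prior $\mu \sim \text{Dir}(\alpha, \ldots, \alpha)$, and again not the official solution: marginalize $\mu$ out of the joint, combining the likelihood from 2a with the Dirichlet prior density,
\begin{align*}
p(x_1, \ldots, x_N | \alpha)
&= \int p(x_1, \ldots, x_N | \mu) \, p(\mu | \alpha) \, d\mu
= \frac{\Gamma(V\alpha)}{\prod_{v=1}^V \Gamma(\alpha)} \int \prod_{v=1}^V \mu_v^{n_v + \alpha - 1} \, d\mu.
\end{align*}
The identity from 1b's hint evaluates the remaining integral, and $\sum_{v=1}^V n_v = N$ gives the stated denominator.

\end{document}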