\documentclass[11pt]{article}
\usepackage{graphicx}
\usepackage{times}
\usepackage{amssymb}
\usepackage{float}
\usepackage{amsmath,amssymb,amsfonts,bm}
% --- Problem numbering ---------------------------------------------------
% \refno is a TeX counter starting at 1. \ref is redefined here (replacing
% LaTeX's standard cross-reference command) to print the current value of
% \refno and then increment it globally. The problem headings use it as
% "Problem \ref:" so that problems are numbered automatically in order.
\newcount\refno\refno=1
\def\ref{\the\refno \global\advance\refno by 1}
% --- Notation shorthands -------------------------------------------------
% Underlined symbols: x, w, theta and mu.
\def\ux{\underline{x}}
\def\uw{\underline{w}}
\def\ut{\underline{\theta}}
\def\umu{\underline{\mu}}
% Shorthand for the symbol p_e^* (math mode only).
\def\be{p_e^*}
% --- Manual equation numbering -------------------------------------------
% \eqnumber is a TeX counter starting at 1. \eq prints its current value and
% then increments it globally; \eqs is an alias for \eq; \eqn places the
% number in parentheses as a display-equation tag via \eqno.
\newcount\eqnumber\eqnumber=1
\def\eq{\the \eqnumber \global\advance\eqnumber by 1}
\def\eqs{\eq}
\def\eqn{\eqno(\eq)}
% --- Page style and layout -----------------------------------------------
% Default page style is empty; the document body later switches to
% myheadings after \begin{document}.
\pagestyle{empty}
% Line-spacing factor of 1.1.
\def\baselinestretch{1.1}
% NOTE(review): \topmargin is set to 1in here and then immediately reset to
% 0in on the next line, so only the \headsep setting from this line persists.
\topmargin1in \headsep0.3in
\topmargin0in \oddsidemargin0in \textwidth6.5in \textheight8.5in
\begin{document}
\setlength{\parskip}{1.2ex plus0.3ex minus 0.3ex}
\thispagestyle{empty} \pagestyle{myheadings} \markright{Homework
2: CS 274A, Probabilistic Learning: Winter 2018}
\title{CS 274A Homework 2}
\author{Probabilistic Learning: Theory and Algorithms, CS 274A, Winter 2018}
\date{Due Date: submit hardcopy to Reader, DBH 3209, between 3pm and 5pm, Friday January 26th}
\maketitle
\section*{Instructions and Guidelines for Homeworks}
\begin{itemize}
\item
Please answer all of the questions and submit a {\bf hardcopy} of your written solutions
(either hand-written or typed are fine as long as the writing is legible). Clearly mark
your name on the first page.
%Code (if requested) should be submitted to the EEE dropbox. No
%need to submit any code unless we request it.
\item
All problems are worth 10 points unless otherwise stated. All homeworks will get equal weight in computation of the final grade for the class.
\item
The homeworks are intended to help you work through the concepts
we discuss in class in more detail. It is important that you try
to solve the problems yourself. The homework problems are important to help you better
learn and reinforce the material from class. If you don't
do the homeworks you will likely have difficulty in the exams
later in the quarter.
\item If you can't solve a
problem, you can discuss it {\it verbally} with another student. However, please note that before you submit your homework solutions you
are not allowed to view (or show to any other student) any {\it written material} directly related to the homeworks, including other students' solutions or drafts of solutions, solutions from previous versions of this class, and so forth. The work you hand in should be your own original work.
\item You are allowed to use reference materials in your solutions, such as class notes, textbooks, other reference material (e.g., from the Web), or solutions to other problems in the homework. It is strongly recommended that you first try to solve the problem yourself, without resorting to looking up solutions elsewhere. If you base your solution on material that we did not discuss in class, or that is not in the class notes, then you need to clearly provide a reference, e.g., ``based on material in Section 2.2 in \ldots''
\item
In problems that ask for a proof you should submit a complete mathematical
proof (i.e., each line must follow logically from the preceding one, without
``hand-waving''). Be as clear as possible in explaining
your notation and in stating your reasoning as you go from line to line.
\item
If you wish to use LaTeX to write up
your solutions you may find it useful to use the .tex file for this homework
that is posted on the Web page.
\end{itemize}
\vfill\eject
\noindent{\bf Background Reading}: Note Set 3 is relevant for the problems below.
\subsection*{Problem \ref: (Log-Likelihood Functions)}
Consider the following data set $D=\{3, 5, 8, 9, 7, 12, 10, 6, 13,
7\}$. Use MATLAB (or Python, or R, or something similar)
to generate graphs of the log-likelihood function for each of the following cases:
\begin{enumerate}
\item a Gaussian model with $\mu$ as the unknown parameter in the log-likelihood function
and with a fixed standard deviation of $\sigma = 3$.
\item a uniform distribution with $a=2$ and $b$ as the unknown
parameter in the log-likelihood function
\item an exponential distribution with the exponential parameter as the unknown parameter in the log-likelihood function.
\end{enumerate}
In each case you can plot a range of values around the mode of the log-likelihood,
e.g., if $\theta$ is the mode you could plot in the range $[0.2\theta, 2\theta]$.
Comment on the shape of each of the 3 plots. Please submit a hardcopy of your graphs with your homework.
\noindent{\bf Note:} In the next several problems
below assume that
a data set $D = \{x_1,\ldots,x_n\}$ exists. You can also assume
that the $x_i$'s are conditionally independent of each other
given the parameters of the model.
\subsection*{Problem \ref: (Maximum Likelihood for the Gaussian Model)}
Let $f(x;\theta)$ be a Gaussian density function, i.e.,
\[
f(x;\theta) \ = \ \frac{1}{\sqrt{2 \pi \sigma^2}}
e^{-\frac{1}{2 \sigma^2} (x - \mu)^2}
\]
Derive formulas defining the maximum likelihood estimates of $\mu$ and $\sigma^2$.
\subsection*{Problem \ref: (Maximum Likelihood for the Geometric Model)}
Derive the maximum-likelihood estimator for the geometric
distribution with parameter $p$, where the geometric distribution is defined as
\[
P(X = k) \ = \ (1-p)^k p, \qquad k = 0, 1, 2, 3, \ldots, \qquad 0 < p < 1
\]
\subsection*{Problem \ref: (Maximum Likelihood for the Poisson Model)}
Derive the maximum-likelihood estimator for the Poisson distribution, where \[ P(X=k) = \frac{e^{-\lambda}
\lambda^k}{k!}, \] with parameter $\lambda > 0$ and
where $k \in \{0, 1, 2, 3, \ldots\}$.
\subsection*{Problem \ref: (Maximum Likelihood: Comparing 2 Models)}
Assume you are working for a large search engine company and you wish to model the
distribution of the number of search results that a user clicks on for a typical search.
Let $X$ be a random variable taking values $k = 0, 1, 2, \ldots$, where $k$
represents the number of clicks. $P(X)$ represents the distribution of the number of clicks
for a randomly selected user.
\begin{enumerate}
\item Let the data set $D$ consist of observations from 100 users, summarized
by the following table of values:
\begin{verbatim}
value k number of users with this value k
0 8
1 19
2 31
3 21
4 10
5 9
6 1
7 1
\end{verbatim}
Letting $n_k$ be the number of users with $k$ clicks, write down an expression
for the likelihood for a Poisson model and for a geometric model.
\item
Plot the log-likelihood for each of the geometric and Poisson models as a
function of their respective parameters for this data set.
\item Using the maximum likelihood estimates of the parameters (as derived in the
earlier problems), on a single
plot with the x-axis running from 0 to 10, plot the following:
\begin{itemize}
\item The empirical probability (from the data) of each value
\item The probability distribution for the geometric model
\item The probability distribution for the Poisson model
\end{itemize}
\item Is the geometric or Poisson model a better fit to this data? Hint: you can use the log-likelihood of the
observed data for each model, conditioned on the maximum likelihood estimate of the parameters, as a metric for
seeing which model has a better fit.
\end{enumerate}
You may use R, Python, or MATLAB to generate the plots above
(you do not need to submit any code with your solution, but do submit your plots).
\subsection*{Problem \ref: (Maximum Likelihood for the Uniform Model)}
Let $X$ be uniformly distributed with lower limit $a$ and upper
limit $b$, where $b>a$, i.e.,
\[ p(x) = \frac{1}{b-a} \]
for $a \le x \le b$ and $p(x) = 0$ otherwise.
\begin{enumerate}
\item Derive maximum likelihood estimators for $a$ and $b$ (think
carefully about how to do this).
\item Write 2 or 3 sentences discussing these maximum likelihood estimators, e.g., do they make intuitive sense? If not, briefly describe an alternative method for
estimating the parameters.
\end{enumerate}
\subsection*{Problem \ref: (Maximum Likelihood for the Multinomial Model)}
Consider building a probabilistic model for how often words occur in English.
Let $W$ be a random variable, taking values $w \in \{w_1, \ldots, w_V\}$, where
$V$ is the number of words in the vocabulary. In practice $V$ can be very
large, e.g., $V=100{,}000$ is not unusual (there are more words than this in
English, but many rare words are not modeled).
The {\it multinomial model} for $W$ is essentially the same as the binomial
model for tossing coins, where we have independent trials, but instead of two
possible outcomes there are now $V$ possible outcomes for each ``trial''. The
parameters of the multinomial are $\theta = \{\theta_1,\ldots, \theta_V\}$,
where $\theta_k = P(W = w_k)$, and where $\sum_{k=1}^V \theta_k = 1$. Denote
the observed data as $D = \{r_1,\ldots,r_V\}$, where $r_k$ is the number of
times word $k$ occurred in the data (these are the sufficient statistics for
this model).
Derive the maximum likelihood estimates for $\theta_k, 1 \le k \le V$ for this model.
\subsection*{Problem \ref: (Maximum Likelihood for a Markov Chain)}
Consider a data set where we have $M$ sequences, e.g., $M$ documents where each document consists of
a sequence of words. Let $x_{m,t}$ be the value of the $m$th sequence in the $t$th position, with
$1 \le m \le M$ and $x_{m,t} \in \{1, \ldots, K\}$ (e.g., $K$ unique words in the vocabulary). Let $T_m$
be the length of sequence $m$.
Assume the sequences are generated conditionally independently of each other given the parameters
of the model below.
Assume that we will use a first-order Markov chain to model the sequences. The unknown
parameters of the model are the probabilities in the $K \times K$ transition
matrix $\theta_{i,j} = P(x_{m,t} = j \mid x_{m,t-1} = i)$, $1 \le i, j \le K$, $t > 1$, i.e.,
$\theta_{i,j}$ is the probability of the chain transitioning from $i$ to $j$.
Assume that the initial probability distribution for the chain $P(x_{m,1})$ is known.
Let $N_{m,i,j}$ and $N_{m,i}$ be the number of times that sequence $m$ transitions from $i$ to
$j$ and the number of times that sequence $m$ is in state $i$, respectively.
\begin{enumerate}
\item Define the likelihood for a particular sequence $m$ as a function
of $\theta_{i,j}$, $N_{m,i,j}$, $N_{m,i}$, and the initial state probabilities.
\item Define the likelihood for all $M$ sequences.
\item Prove that the maximum likelihood estimate for $\theta_{i,j}$ can be written as
\[
\hat{\theta}_{i,j} = \frac{ \sum_{m=1}^M N_{m, i, j} } {\sum_{m=1}^M N_{m,i} }
\]
\item Explain in words why it would be suboptimal to instead estimate a separate
maximum likelihood estimate $ \theta_{m,i,j}^{ML}$ for each sequence (just based on the data for each
sequence) and to then average these estimates across the $M$ sequences.
\end{enumerate}
\end{document}