
Commit fb01642e authored by Daniel Lukats

some updates to title, main, glossary

parent b9e224c0
\begin{titlepage}
\centering
\begin{figure}
\begin{subfigure}{0.49\textwidth}
\hspace{0.22cm}\includegraphics[scale=1.0, left]{00_orga/fh_logo}
\end{subfigure}
\begin{subfigure}{0.49\textwidth}
\includegraphics[scale=1.0, right]{00_orga/eti_logo}
\end{subfigure}
\end{figure}
%\hrulefill
\vspace*{2cm}
{
\vspace*{2.5cm}
{\huge\bfseries Evaluating Hyperparameter and Implementation Choices of Proximal Policy Optimization}\\
\vspace{0.3cm}
{\Large{On a Selection of ATARI 2600 Games}}\\
\vspace{1cm}
{\huge Master's thesis} \\
\vspace{0.3cm}
\vspace{3cm}
\vfill
\large{Daniel Lukats} \\
% \large{978528} \\ TODO
\today \\
\vspace{1cm}
\begin{table}[H]
\centering
\begin{tabular}{l l}
\large{First examiner} & \large{Prof. Dr.-Ing. Jürgen te Vrugt}\\
\large{Second examiner} & \large{Prof. Dr. Kathrin Ungru}
\end{tabular}
\end{table}
\vspace*{3cm}
}
\end{titlepage}
\section{List of Experiments}
TODO
......@@ -12,8 +12,8 @@
robotics tasks \cite{ilyas2018, engstrom2019}, but to the author's knowledge no
benchmarking tasks like ATARI 2600 video games. Therefore, this thesis explains the PPO clip variant of Proximal Policy
Optimization and evaluates the impact of the aforementioned optimizations on a selection of five ATARI games.
The experiments reveal that most optimization choices have a significant effect on the performance of the algorithm;
only a single optimization is found to be less impactful. Furthermore, the experiments support \citeauthor{ppo}'s
\citeyear{ppo} claim that PPO clip is robust to hyperparameter choices, as agents learn even when configured
suboptimally. Finally, significant deviations are apparent in approximately 35\% of the experiments. As a consequence,
the reliability of the evaluation methods employed by the authors of the original publication may be questioned.
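% For reference, a hedged sketch of the clipped surrogate objective of PPO clip as given by
% \citeauthor{ppo}; the notation follows that paper and is not copied from the thesis:
% $L^{\mathit{CLIP}}(\boldsymbol\theta) = \hat{\mathbb{E}}_t\big[\min\big(r_t(\boldsymbol\theta)\hat{A}_t,\,
% \mathrm{clip}(r_t(\boldsymbol\theta), 1-\epsilon, 1+\epsilon)\hat{A}_t\big)\big]$, where
% $r_t(\boldsymbol\theta) = \pi_{\boldsymbol\theta}(a_t \mid s_t) / \pi_{\boldsymbol\theta_\text{old}}(a_t \mid s_t)$.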
......@@ -23,9 +23,24 @@
description = {the acting and learning entity}
}
\newglossaryentry{ale-acr}{
name = {ALE},
description = {see \emph{Arcade Learning Environment}},
}
\newglossaryentry{ale}{
name = {Arcade Learning Environment},
description = {a framework for benchmarking reinforcement learning algorithms on ATARI 2600 games}
}
\newglossaryentry{dynamics}{
name = {dynamics},
description = {a probability distribution that determines the environment's behavior. Denoted $p$}
}
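% For reference, a minimal sketch of the definition this entry presumably points to
% (cf. eq.~\ref{eqn:dynamics}); the four-argument convention is an assumption, not a quote:
% $p(s', r \mid s, a) \doteq \Pr\{S_t = s', R_t = r \mid S_{t-1} = s, A_{t-1} = a\}$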
\newglossaryentry{fitness}{
name = {fitness function},
description = {a metric that is to be optimized. Denoted $J$}
}
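% A hedged sketch of the episodic objective this entry likely denotes; the exact form used
% in the thesis may differ: $J(\boldsymbol\theta) \doteq v_{\pi_{\boldsymbol\theta}}(s_0)$,
% i.e., the value of the start state under the current policy.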
\newglossaryentry{entropybonus}{
......@@ -68,6 +83,11 @@
description = {a method for maximizing or minimizing multidimensional objective functions}
}
\newglossaryentry{gym}{
name = {Gym},
description = {see \emph{OpenAI Gym}}
}
\newglossaryentry{horizon}{
name = {horizon},
description = {the final time step of an episode},
......@@ -89,9 +109,15 @@
determining the behavior of the agent and the environment}
}
\newglossaryentry{openaigym}{
name = {OpenAI Gym},
description = {a framework that includes several benchmark environments, e.g., robotics tasks and the Arcade
Learning Environment}
}
\newglossaryentry{policy}{
name = {policy},
description = {a probability distribution that determines the agent's behavior. Denoted $\pi$}
}
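% For reference, the standard definition presumably behind eq.~\ref{eqn:policy}, stated
% here as an assumption: $\pi(a \mid s) \doteq \Pr\{A_t = a \mid S_t = s\}$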
\newglossaryentry{ppo-acr}{
......@@ -119,7 +145,7 @@
\newglossaryentry{rollout}{
name = {rollout},
description = {a sequence of states, actions, rewards, action probabilities and values generated from interaction of
the agent with the environment. Denoted $\tau$}
}
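% A minimal sketch of such a rollout of length $T$; the exact layout is an assumption:
% $\tau \doteq (s_0, a_0, r_1, s_1, a_1, r_2, \dots, s_{T-1}, a_{T-1}, r_T)$, stored together
% with the action probabilities $\pi(a_t \mid s_t)$ and the value estimates $\hat{v}_\pi(s_t, \boldsymbol\omega)$.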
\newglossaryentry{state}{
......@@ -132,6 +158,12 @@
description = {TODO}
}
\newglossaryentry{terminal}{
name = {terminal state},
description = {an absorbing state that the agent cannot leave. Transitioning to a terminal state marks the end of an
episode}
}
\newglossaryentry{trajectory}{
name = {trajectory},
description = {a sequence of states, actions and rewards generated from interaction of the agent with the
......
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
% Font
\usepackage{lmodern}
% \usepackage[headsepline]{scrlayer-scrpage}
% \clearpairofpagestyles
% Header
% FIGURES
\usepackage{graphicx}
......@@ -31,7 +33,8 @@
\usepackage{listings}
% TODO check hidelinks here
\usepackage[hyphens]{url}
\usepackage[hidelinks,breaklinks]{hyperref} % must be loaded before apacite
\hypersetup{
pdfauthor = {Daniel Lukats},
pdftitle = {Master's thesis},
......@@ -61,3 +64,6 @@
\AtBeginEnvironment{thebibliography}{\interlinepenalty=10000}
\usepackage{csquotes}
\widowpenalty = 10000
\clubpenalty = 10000
......@@ -7,8 +7,8 @@
\begin{document}
\pagenumbering{gobble}
\newgeometry{left=1.5cm, right=1.5cm, top=2.5cm, bottom=2.5cm}
\input{00_orga/title}
\restoregeometry
\pagenumbering{roman}
......@@ -63,6 +63,10 @@
\bibliographystyle{apacite}
\bibliography{bibliography.bib}
\cleardoublepage
\appendix
\input{07_appendix/experiments.tex}
\clearpage
\input{00_orga/oath.tex}
\end{document}
\begin{longtable}{l l l}
$\doteq$ & defined to be & \\
$\sum_{s', r}$ & shorthand for $\sum_{s'\in\mathcal{S}}\sum_{r\in\mathcal{R}}$ & \\
$\#M$ & cardinality of the set $M$ & \\
$\propto$ & proportional to & \\
$a,s\sim\pi$ & $a,s$ observed by following $\pi$ & \\
$\mathbb{E}$ & expected value & \\
......@@ -20,7 +21,6 @@
\\
$p(s', r \mid s, a)$ & dynamics function & (chapter \ref{sec:02:distributions}, eq.~\ref{eqn:dynamics}) \\
$\pi(a\mid s)$ & policy & (chapter \ref{sec:02:distributions}, eq.~\ref{eqn:policy}) \\
\\
$\alpha$ & learning rate & (chapter \ref{sec:02:gradient_ascent}) \\
$\gamma$ & discount factor & (chapter \ref{sec:02:value_function}) \\
......@@ -39,6 +39,7 @@
$\hat{v}_\pi(s, \boldsymbol\omega)$ & parameterized value function & (chapter \ref{sec:02:function_approximation}) \\
$\hat{a}_\pi(s, a, \boldsymbol\omega)$ & parameterized advantage function & (chapter
\ref{sec:02:function_approximation}, eq.~\ref{eqn:a_hat}) \\
$\mu(s)$ & stationary distribution of states & (chapter \ref{sec:02:function_approximation}) \\
$\overline{\text{VE}}(\boldsymbol\omega)$ & mean squared value error & (chapter \ref{sec:02:function_approximation},
eq.~\ref{eqn:value_error}) \\
\\
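% For reference, the mean squared value error in its common form, assumed to match
% eq.~\ref{eqn:value_error}:
% $\overline{\text{VE}}(\boldsymbol\omega) \doteq \sum_{s} \mu(s) \big[v_\pi(s) - \hat{v}_\pi(s, \boldsymbol\omega)\big]^2$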
......
\vspace{5cm}
\begin{center}
SPPOCK -- Solid PPO ChecK\\
{\small by Lukats}
\end{center}