{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e013e737-e3a7-4cf9-9d5b-d6f99cb0bfc9", "metadata": {}, "outputs": [], "source": [ "# This is a new library compared to the previous modules. \n", "# Please perform \"pip install sentence_transformers==2.7.0\"\n", "from sentence_transformers import SentenceTransformer\n", "\n", "#sentence transformer helps in calling pretrained models from huggingface\n", "model = SentenceTransformer(\"multi-qa-distilbert-cos-v1\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "db3836de-9292-4882-8ae5-db988bacf401", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([ 7.82226548e-02, -4.01311405e-02, 3.86135913e-02, -1.78966438e-04,\n", " 8.92347097e-02, -5.04591092e-02, -1.05026569e-02, 3.71055678e-02,\n", " -4.18713912e-02, 3.48084792e-02, -1.20701883e-02, -2.36942340e-02,\n", " 3.87900174e-02, 1.60988607e-02, 3.50747295e-02, 3.04746162e-03,\n", " 5.79672381e-02, -4.10627462e-02, -3.41552682e-02, -2.56396383e-02,\n", " -3.55263911e-02, 1.42908087e-02, -1.62799917e-02, 3.21446545e-02,\n", " -4.66897376e-02, 7.89186060e-02, 4.90160920e-02, 1.56761166e-02,\n", " -1.69110075e-02, 2.26482227e-02, 5.60206100e-02, -3.98361087e-02,\n", " 6.77409917e-02, -1.20209912e-02, 1.12621894e-03, -1.94394365e-02,\n", " -2.65951678e-02, 1.06177367e-02, 1.69687122e-02, 1.13487840e-02,\n", " -2.97063086e-02, 5.25258258e-02, -1.41453547e-02, 4.61699851e-02,\n", " 1.17066065e-02, -2.38053519e-02, -6.32558241e-02, -1.92042235e-02,\n", " -7.10592186e-03, 3.24167833e-02, 2.49618199e-02, -5.27503015e-03,\n", " 2.01149024e-02, -3.72371152e-02, 3.46405394e-02, -3.29310261e-02,\n", " -2.01484803e-02, 5.07839303e-03, -4.55506742e-02, 7.89169688e-03,\n", " -4.91713583e-02, 4.69897278e-02, -3.80505981e-06, 2.48839278e-02,\n", " -2.96471510e-02, 6.69760117e-03, 3.58087718e-02, -7.18094781e-03,\n", " -3.03277262e-02, 7.95786362e-03, 3.36150825e-02, -2.25491151e-02,\n", " -3.62097844e-02, -2.03929059e-02, -2.70679290e-03, -8.15981850e-02,\n", " -1.43993227e-02, 4.90568765e-02, -2.11568642e-02, 9.31681832e-04,\n", " 6.57264590e-02, 4.69609909e-02, 1.23655573e-02, -2.84593143e-02,\n", " -2.57528741e-02, 1.46925887e-02, -5.53987361e-02, -3.05970423e-02,\n", " 5.75574040e-02, 2.81941295e-02, -1.30107012e-02, -2.08614748e-02,\n", " -2.43033655e-02, -2.40275078e-02, 4.11576889e-02, -5.92422858e-02,\n", " -1.69009902e-03, 4.00552042e-02, 2.43852679e-02, -1.10905133e-02,\n", " -2.93893088e-02, 2.01826487e-02, 4.22846200e-03, 2.95343306e-02,\n", " 3.16523425e-02, 3.00929882e-02, 8.98854155e-03, -8.30620825e-02,\n", " -2.24144291e-02, 1.82182994e-02, -5.66515476e-02, -4.21271920e-02,\n", " 3.27233039e-02, -2.10854225e-02, -4.79223672e-03, 1.90163292e-02,\n", " 4.55397591e-02, -9.62481555e-03, -2.30316184e-02, -3.52343246e-02,\n", " -3.91419008e-02, -3.46547812e-02, 2.47354899e-02, 4.71566729e-02,\n", " 3.46270725e-02, 5.29964082e-02, -2.86405422e-02, -1.71785112e-02,\n", " -1.30146304e-02, -5.79404645e-02, 3.99250425e-02, -5.88766523e-02,\n", " 4.57601547e-02, 2.09622178e-02, -1.52759813e-02, 3.25414911e-02,\n", " 2.08252836e-02, -2.48042028e-02, -4.24147137e-02, 2.95954887e-02,\n", " -4.62034531e-02, 3.17028686e-02, -7.64973415e-03, 5.84934885e-03,\n", " -3.90673876e-02, 8.11777450e-03, -3.60446684e-02, 6.34325966e-02,\n", " -4.23138998e-02, -1.57085271e-03, 9.18294396e-03, -2.99421139e-02,\n", " -2.27955412e-02, -9.86715313e-03, 2.94452514e-02, 1.75152998e-02,\n", " -1.93444565e-02, 2.40450036e-02, 2.92803030e-02, 4.78763308e-04,\n", " 2.55402960e-02, -3.34960185e-02, 3.04799844e-02, -9.45277605e-03,\n", " 3.35257240e-02, 5.77760562e-02, -1.58551391e-02, -7.15148151e-02,\n", " 6.71641843e-04, -7.77694816e-03, -5.37220389e-02, -1.58578809e-02,\n", " 2.93348879e-02, -5.35059497e-02, -2.82784868e-02, -3.81728970e-02,\n", " 1.05941994e-03, 3.31668765e-03, 2.60325111e-02, -2.05704104e-02,\n", " -2.55044121e-02, -2.73665562e-02, 2.12153289e-02, -3.17961685e-02,\n", " -2.71619149e-02, -3.00257225e-02, -5.35569862e-02, 1.62939113e-02,\n", " 2.55571841e-03, 7.67827779e-02, 3.24226320e-02, 3.80413905e-02,\n", " -2.25355513e-02, -4.81641367e-02, 2.26790644e-02, 1.25548532e-02,\n", " -4.78156134e-02, 4.13825735e-02, -9.52939037e-03, 3.37111093e-02,\n", " 3.21243405e-02, 5.91824725e-02, -7.25298226e-02, -7.38689676e-03,\n", " -3.22184600e-02, 3.09309158e-02, 5.23214079e-02, 3.24243642e-02,\n", " -3.95455174e-02, -2.63985246e-02, -2.35443041e-02, 2.39192112e-03,\n", " 3.20997350e-02, 8.44291691e-03, 9.85345244e-03, 1.95927639e-02,\n", " 3.99371944e-02, 4.52749990e-02, 3.52183990e-02, 1.67367645e-02,\n", " 2.65721064e-02, -8.88593495e-03, -1.27366967e-02, -5.89371249e-02,\n", " -2.89509296e-02, 2.18168125e-02, -4.62896600e-02, -5.12742111e-03,\n", " -2.73846313e-02, -4.35681045e-02, -3.33959796e-02, 2.61665066e-03,\n", " 6.77051917e-02, -6.68666838e-03, 4.25815843e-02, -8.47998168e-03,\n", " -4.45969924e-02, -4.92077544e-02, 2.54241284e-02, 3.41309234e-02,\n", " 4.66176234e-02, 3.41222510e-02, -3.89920324e-02, 6.68450594e-02,\n", " 6.32637590e-02, -1.53560825e-02, -6.43526320e-04, 1.88045055e-02,\n", " 1.10457549e-02, -2.76161581e-02, 4.89471853e-02, -6.65619299e-02,\n", " 4.41557867e-03, -8.06404930e-03, -7.56582916e-02, 5.20581715e-02,\n", " -1.68674570e-02, -1.51841035e-02, 2.59556379e-02, 4.38798638e-03,\n", " 1.29939690e-02, 2.37958338e-02, -3.92605551e-02, 3.40803619e-03,\n", " -4.65255305e-02, -5.80669269e-02, -4.86324355e-02, 3.85592617e-02,\n", " 1.58163607e-02, -3.55917811e-02, -6.13349043e-02, -4.66559343e-02,\n", " 3.48288640e-02, -3.00835352e-02, -3.80521938e-02, 5.35570718e-02,\n", " -4.42223065e-02, -4.11476716e-02, 2.34690122e-02, 4.05010542e-05,\n", " -2.18803668e-03, -2.06341594e-03, -4.33782414e-02, -6.21988031e-04,\n", " -4.64339629e-02, 8.27862918e-02, -1.49072725e-02, 3.24270837e-02,\n", " 1.35792540e-02, -1.49164279e-03, 8.62602443e-02, 6.92182630e-02,\n", " 4.57403949e-03, 3.55520216e-03, 6.91157132e-02, -9.68690887e-02,\n", " 3.21001112e-02, -1.90142468e-02, 7.19640031e-02, 7.18858466e-02,\n", " 2.10939180e-02, -8.37227912e-04, -2.27937177e-02, 1.27857607e-02,\n", " 9.23394412e-02, -5.41783869e-02, 4.61493842e-02, -7.08942907e-03,\n", " -3.20987105e-02, -3.81699614e-02, -4.22405638e-02, 5.16356155e-02,\n", " 1.07124997e-02, -5.59868626e-02, -2.29028445e-02, 3.15916613e-02,\n", " -5.60135320e-02, 1.22199180e-02, -1.85765512e-02, -3.86150531e-03,\n", " 5.66245541e-02, 3.09960041e-02, 3.28162983e-02, 5.95071204e-02,\n", " -1.15265511e-02, 2.45990828e-02, 1.98689359e-03, 3.61350439e-02,\n", " 7.19022304e-02, 6.74675079e-03, -2.22808663e-02, 3.80802527e-02,\n", " -3.19978781e-02, 4.77899686e-02, -4.88462038e-02, -2.62966491e-02,\n", " -9.14799236e-03, -3.70225236e-02, -2.10673101e-02, 3.66252549e-02,\n", " -2.93567218e-03, 1.90015603e-02, -4.06738557e-02, -9.88849811e-03,\n", " -1.32069951e-02, -1.84205808e-02, -3.04457080e-02, -1.17648123e-02,\n", " -2.16220692e-02, -1.32907946e-02, -4.90810424e-02, 3.75495143e-02,\n", " 2.36799140e-02, 6.20988198e-03, 1.94518864e-02, 1.11171426e-02,\n", " -1.27271013e-02, -1.16941072e-02, -4.15579043e-02, 3.03730648e-03,\n", " 3.35785486e-02, 2.02150960e-02, -5.23533672e-02, -1.03417831e-02,\n", " -3.44100222e-02, -1.83595158e-02, 1.93087698e-03, 3.44448425e-02,\n", " -2.83047985e-02, 7.25740707e-03, 5.40008917e-02, -2.32120547e-02,\n", " -2.37849094e-02, 5.14848623e-03, -1.04726683e-02, -3.04511134e-02,\n", " -1.45228235e-02, 5.83771989e-02, 6.24929834e-03, 1.76533423e-02,\n", " 2.71377694e-02, -2.07602661e-02, -4.82953712e-03, 4.38607484e-02,\n", " 4.89790700e-02, -1.84726194e-02, 2.23985426e-02, 2.48786900e-02,\n", " -7.83604383e-03, 2.70838495e-02, 8.69912207e-02, 5.14607914e-02,\n", " 2.68816669e-02, -2.45199017e-02, 1.98570788e-02, 2.55423058e-02,\n", " -4.49709669e-02, 1.18260039e-02, -5.71240447e-02, 6.86878189e-02,\n", " 2.32738610e-02, 4.34287377e-02, 1.60184391e-02, 3.36551219e-02,\n", " 1.18744997e-02, 1.84787195e-02, 2.18717437e-02, 8.06097686e-03,\n", " 2.65186783e-02, -1.67910997e-02, -1.14257671e-02, 5.79696000e-02,\n", " 2.19955631e-02, -7.88591802e-02, 4.26271632e-02, -7.00563341e-02,\n", " 5.12132347e-02, 2.96157692e-02, 4.99608479e-02, 9.40513331e-03,\n", " -4.73498330e-02, 4.21216972e-02, -1.81927457e-02, -7.69210979e-02,\n", " 9.72863659e-03, -6.79067299e-02, 1.55991707e-02, 2.72873025e-02,\n", " 4.54439968e-03, -2.10018735e-03, 7.53396973e-02, 1.33481680e-03,\n", " -1.99394729e-02, -5.24484590e-02, 3.05672060e-03, 1.98291969e-02,\n", " 2.39940807e-02, 1.24235880e-02, 1.47050265e-02, 1.03529589e-02,\n", " -4.38815281e-02, 4.75225709e-02, 3.21548879e-02, 6.52114162e-04,\n", " -2.02529635e-02, 4.32258882e-02, -2.71596666e-02, 1.38091687e-02,\n", " -3.86391468e-02, 2.84993276e-02, -2.27448484e-03, 3.99671830e-02,\n", " 1.53477099e-02, 1.32416096e-02, -7.11276606e-02, 3.93648371e-02,\n", " 2.51131710e-02, -2.36388445e-02, 1.80941354e-02, -2.43954528e-02,\n", " 2.19318410e-03, 3.75051983e-02, 1.56092057e-02, 7.14064203e-03,\n", " -3.41359712e-02, 7.57620391e-03, 2.62274388e-02, 1.14289289e-02,\n", " 3.52655686e-02, 2.13690219e-03, -4.56283707e-03, -2.63882335e-02,\n", " 6.55588508e-02, 6.00263663e-02, -3.54714617e-02, -1.01350760e-02,\n", " 2.56268885e-02, -1.02697171e-01, 3.54785323e-02, -6.22319356e-02,\n", " -1.14733977e-02, -3.44268000e-03, 1.05505169e-03, -2.81982776e-03,\n", " 6.26723096e-02, -4.57265675e-02, 1.70462877e-02, -8.05483162e-02,\n", " 1.55107807e-02, -2.58711661e-04, 2.01547276e-02, 8.79565068e-03,\n", " 2.62557846e-02, 8.31562385e-04, -3.23720016e-02, -5.27925603e-02,\n", " 1.55638680e-02, 1.15739089e-02, -2.99575040e-03, -8.24663788e-03,\n", " -3.26189138e-02, -7.73414597e-02, -4.12966944e-02, -2.21012570e-02,\n", " 4.75402828e-03, -1.25991041e-02, 1.07523538e-02, 4.99581657e-02,\n", " 2.15218812e-02, 2.69317944e-02, 3.28038894e-02, 7.46345147e-03,\n", " 1.83876902e-02, -2.74957772e-02, -9.13319178e-03, -1.22790448e-02,\n", " 2.07576193e-02, 1.40681854e-02, 1.09913824e-02, -2.26452295e-02,\n", " 6.41365945e-02, -1.79608297e-02, -4.23230492e-02, -1.88530446e-03,\n", " -3.59299891e-02, 1.20294318e-02, 1.00361360e-02, 6.56100512e-02,\n", " 2.87165996e-02, 2.78465226e-02, -5.23849875e-02, 3.18163037e-02,\n", " -4.65675965e-02, 1.38317067e-02, 2.83224564e-02, 7.78382039e-03,\n", " -9.74422507e-03, -3.97435762e-02, -2.49803960e-02, -2.32911278e-02,\n", " -2.11589038e-02, -6.84602931e-03, -1.95434690e-02, -2.29272041e-02,\n", " -1.46510219e-02, -2.22448800e-02, -1.09536527e-02, -3.67134400e-02,\n", " -1.84324663e-02, -1.04103079e-02, -8.88101105e-03, -1.47620523e-02,\n", " -2.92523894e-02, 1.00569651e-02, 2.28201374e-02, 1.42327184e-03,\n", " -2.35474743e-02, -2.39605289e-02, 5.40141128e-02, -2.16813292e-02,\n", " 1.94904730e-02, -3.11479066e-02, -1.54089881e-02, -1.45635931e-02,\n", " 5.81609830e-02, 3.05503402e-02, -5.09663625e-03, -1.68099571e-02,\n", " -3.46655361e-02, -2.07241401e-02, -4.60087508e-02, 2.22174148e-03,\n", " -4.44852635e-02, 1.13986572e-02, 3.25434506e-02, -7.11341351e-02,\n", " -2.52612736e-02, -1.93929002e-02, 1.29123619e-02, -3.82378586e-02,\n", " -1.61821116e-02, 3.58506851e-02, 5.04317284e-02, -4.06110659e-03,\n", " 1.62504464e-02, -7.24662542e-02, 2.72172000e-02, 1.78223494e-02,\n", " 1.37029039e-02, 2.36819778e-02, 2.59892847e-02, 4.08842601e-02,\n", " 6.46982156e-03, -1.15187764e-02, -2.65878811e-02, 9.67550930e-03,\n", " -5.31226061e-02, 8.27150885e-04, 1.87266115e-02, -2.92988610e-03,\n", " 2.49405000e-02, -1.47512518e-02, 1.41259236e-02, 4.05030325e-02,\n", " -1.27288513e-02, -6.76722527e-02, 4.83829528e-02, 4.07700390e-02,\n", " 1.98605303e-02, 3.03293932e-02, 2.63720937e-02, 5.23594357e-02,\n", " -3.34631242e-02, -9.35077388e-03, 1.69177558e-02, 4.34683636e-02,\n", " 3.83584835e-02, 4.47562449e-02, 1.90879237e-02, -1.47993593e-02,\n", " -4.34137182e-03, 1.08403396e-02, 4.95932903e-03, -1.63312294e-02,\n", " -1.97148160e-03, -4.17675115e-02, 6.56750873e-02, -6.79403171e-02,\n", " 4.33396222e-03, -1.69463288e-02, 2.51759049e-02, 2.56763790e-02,\n", " 1.15081819e-03, -1.54735418e-02, -7.34513439e-03, -5.32572754e-02,\n", " 3.48775974e-03, -1.38180405e-02, -5.89082129e-02, 2.92865466e-03,\n", " 5.04819043e-02, 3.35813081e-03, -8.88556316e-02, -3.84001024e-02,\n", " -2.22147852e-02, -2.82134265e-02, -5.42565063e-03, 2.85600629e-02,\n", " -4.50810678e-02, 7.93271931e-04, 8.93172249e-02, 1.16105061e-02,\n", " -3.18008624e-02, 2.66568139e-02, 2.71802712e-02, 1.89267434e-02,\n", " -2.06058472e-02, -3.02396640e-02, -2.85009667e-02, -3.13029774e-02,\n", " -1.97963081e-02, -6.64772978e-03, -1.67978276e-02, -1.99807789e-02,\n", " -5.34575805e-02, -4.45691025e-04, -2.96823028e-02, 4.37867381e-02,\n", " -1.00504002e-02, -1.09997410e-02, 2.21026279e-02, -3.46548930e-02,\n", " 2.11835746e-02, 1.56232687e-02, 2.64695361e-02, -2.34831516e-02,\n", " 2.44938117e-02, 4.38888483e-02, 7.06146704e-03, -1.21593354e-02,\n", " 1.87902339e-02, -6.91951020e-03, 1.30047482e-02, 1.27161061e-02,\n", " -2.23635416e-03, 2.10030675e-02, 6.99695721e-02, 3.16451974e-02,\n", " -2.35833172e-02, 4.63551981e-03, -6.60421327e-03, -3.26112323e-02,\n", " 7.33723640e-02, -8.55090991e-02, -4.05770428e-02, 2.57487390e-02,\n", " -5.12588657e-02, 6.70370162e-02, -1.24506066e-02, -8.91838446e-02,\n", " 5.48583232e-02, -4.54949923e-02, -3.45565155e-02, -7.73074850e-02,\n", " 4.11657728e-02, -4.30639535e-02, 2.70703062e-02, 3.20439860e-02,\n", " -4.77702059e-02, -1.56586170e-02, 1.72707047e-02, -5.23967221e-02,\n", " -7.17989579e-02, 4.30010585e-03, 7.25752689e-05, 5.29346913e-02,\n", " -3.14253271e-02, 1.90561004e-02, 2.56206449e-02, 7.90969878e-02,\n", " 2.13809479e-02, -2.44315583e-02, -1.42572904e-02, 3.64166945e-02,\n", " 4.89886813e-02, 2.09607687e-02, -4.13178988e-02, 4.09340709e-02,\n", " -1.21459812e-01, 6.68874681e-02, -3.10818490e-04, -6.26350287e-03,\n", " 3.98600399e-02, 5.48594780e-02, -1.15817979e-01, 5.31956479e-02,\n", " -4.32696566e-02, 4.33376320e-02, 2.90152542e-02, -8.89771730e-02,\n", " -2.17882283e-02, 1.54281259e-02, 3.62119824e-02, -9.14942175e-02,\n", " -3.93840950e-03, -3.60860415e-02, -7.26094889e-03, 8.90838169e-03,\n", " -5.32953665e-02, 3.61572951e-02, 8.56206343e-02, -1.42062111e-02,\n", " 2.44349018e-02, 1.94476324e-03, 5.81037812e-02, -2.16630846e-02,\n", " 2.43256316e-02, -2.31526350e-03, -1.37544121e-03, 5.89746274e-02,\n", " 8.06517340e-03, -5.71867311e-03, 2.14907620e-02, -2.76927166e-02,\n", " -3.97252664e-02, 3.13828029e-02, 1.40225887e-03, 5.06811216e-02],\n", " dtype=float32)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.encode(\"I just discovered the course. Can I still join it?\")\n", "\n", "#Answer is 0.07" ] }, { "cell_type": "markdown", "id": "a9b03385-af28-4b36-82f3-91eed36da1ab", "metadata": {}, "source": [ "Prepare the documents" ] }, { "cell_type": "code", "execution_count": 3, "id": "c8d79333-4725-4099-8811-69d1ac02f9c0", "metadata": {}, "outputs": [], "source": [ "import requests \n", "\n", "base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'\n", "relative_url = '03-vector-search/eval/documents-with-ids.json'\n", "docs_url = f'{base_url}/{relative_url}?raw=1'\n", "docs_response = requests.get(docs_url)\n", "documents = docs_response.json()" ] }, { "cell_type": "code", "execution_count": 4, "id": "71832a27-f570-4d2e-8451-d23e8b316f09", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'text': 'Machine Learning Zoomcamp FAQ\\nThe purpose of this document is to capture frequently asked technical questions.\\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\\nData Engineering Zoomcamp FAQ\\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\\nwork', 'section': 'General course-related questions', 'question': 'How do I sign up?', 'course': 'machine-learning-zoomcamp', 'id': '0227b872'}\n", "{'text': 'The course videos are pre-recorded, you can start watching the course right now.\\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.', 'section': 'General course-related questions', 'question': 'Is it going to be live? When?', 'course': 'machine-learning-zoomcamp', 'id': '39fda9f0'}\n", "{'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.', 'section': 'General course-related questions', 'question': 'What if I miss a session?', 'course': 'machine-learning-zoomcamp', 'id': '5170565b'}\n", "{'text': \"The bare minimum. The focus is more on practice, and we'll cover the theory only on the intuitive level.: https://mlbookcamp.com/article/python\\nFor example, we won't derive the gradient update rule for logistic regression (there are other great courses for that), but we'll cover how to use logistic regression and make sense of the results.\", 'section': 'General course-related questions', 'question': 'How much theory will you cover?', 'course': 'machine-learning-zoomcamp', 'id': 'ecca790c'}\n", "{'text': \"Yes! We'll cover some linear algebra in the course, but in general, there will be very few formulas, mostly code.\\nHere are some interesting videos covering linear algebra that you can already watch: ML Zoomcamp 1.8 - Linear Algebra Refresher from Alexey Grigorev or the excellent playlist from 3Blue1Brown Vectors | Chapter 1, Essence of linear algebra. Never hesitate to ask the community for help if you have any question.\\n(Mélanie Fouesnard)\", 'section': 'General course-related questions', 'question': \"I don't know math. Can I take the course?\", 'course': 'machine-learning-zoomcamp', 'id': 'c25b3de4'}\n", "{'text': \"The process is automated now, so you should receive the email eventually. If you haven’t, check your promotions tab in Gmail as well as spam.\\nIf you unsubscribed from our newsletter, you won't get course related updates too.\\nBut don't worry, it’s not a problem. To make sure you don’t miss anything, join the #course-ml-zoomcamp channel in Slack and our telegram channel with announcements. This is enough to follow the course.\", 'section': 'General course-related questions', 'question': \"I filled the form, but haven't received a confirmation email. Is it normal?\", 'course': 'machine-learning-zoomcamp', 'id': '6ba259b1'}\n", "{'text': 'Approximately 4 months, but may take more if you want to do some extra activities (an extra project, an article, etc)', 'section': 'General course-related questions', 'question': 'How long is the course?', 'course': 'machine-learning-zoomcamp', 'id': '67e2fd13'}\n", "{'text': 'Around ~10 hours per week. Timur Kamaliev did a detailed analysis of how much time students of the previous cohort needed to spend on different modules and projects. Full article', 'section': 'General course-related questions', 'question': 'How much time do I need for this course?', 'course': 'machine-learning-zoomcamp', 'id': 'a6897e8c'}\n", "{'text': 'Yes, if you finish at least 2 out of 3 projects and review 3 peers’ Projects by the deadline, you will get a certificate. This is what it looks like: link. There’s also a version without a robot: link.', 'section': 'General course-related questions', 'question': 'Will I get a certificate?', 'course': 'machine-learning-zoomcamp', 'id': '2eba08e3'}\n", "{'text': \"Yes, it's possible. See the previous answer.\", 'section': 'General course-related questions', 'question': 'Will I get a certificate if I missed the midterm project?', 'course': 'machine-learning-zoomcamp', 'id': '1d644223'}\n", "{'text': 'Check this article. If you know everything in this article, you know enough. If you don’t, read the article and join the coursIntroduction to Pythone too :)\\nIntroduction to Python – Machine Learning Bookcamp\\nYou can follow this English course from the OpenClassrooms e-learning platform, which is free and covers the python basics for data analysis: Learn Python Basics for Data Analysis - OpenClassrooms . It is important to know some basics such as: how to run a Jupyter notebook, how to import libraries (and what libraries are), how to declare a variable (and what variables are) and some important operations regarding data analysis.\\n(Mélanie Fouesnard)', 'section': 'General course-related questions', 'question': 'How much Python should I know?', 'course': 'machine-learning-zoomcamp', 'id': '14890cd2'}\n", "{'text': 'For the Machine Learning part, all you need is a working laptop with an internet connection. The Deep Learning part is more resource intensive, but for that you can use a cloud (we use Saturn cloud but can be anything else).\\n(Rileen Sinha; based on response by Alexey on Slack)', 'section': 'General course-related questions', 'question': \"Any particular hardware requirements for the course or everything is mostly cloud? TIA! Couldn't really find this in the FAQ.\", 'course': 'machine-learning-zoomcamp', 'id': 'a4fad482'}\n", "{'text': 'Here is an article that worked for me: https://knowmledge.com/2023/12/07/ml-zoomcamp-2023-project/', 'section': 'General course-related questions', 'question': 'How to setup TensorFlow with GPU support on Ubuntu?', 'course': 'machine-learning-zoomcamp', 'id': '34b7fd35'}\n", "{'text': \"Here’s how you join a in Slack: https://slack.com/help/articles/205239967-Join-a-channel\\nClick “All channels” at the top of your left sidebar. If you don't see this option, click “More” to find it.\\nBrowse the list of public channels in your workspace, or use the search bar to search by channel name or description.\\nSelect a channel from the list to view it.\\nClick Join Channel.\\nDo we need to provide the GitHub link to only our code corresponding to the homework questions?\\nYes. You are required to provide the URL to your repo in order to receive a grade\", 'section': 'General course-related questions', 'question': 'I’m new to Slack and can’t find the course channel. Where is it?', 'course': 'machine-learning-zoomcamp', 'id': '4930aa19'}\n", "{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', 'section': 'General course-related questions', 'question': 'The course has already started. Can I still join it?', 'course': 'machine-learning-zoomcamp', 'id': 'ee58a693'}\n", "{'text': 'The course is available in the self-paced mode too, so you can go through the materials at any time. But if you want to do it as a cohort with other students, the next iterations will happen in September 2023, September 2024 (and potentially other Septembers as well).', 'section': 'General course-related questions', 'question': 'When does the next iteration start?', 'course': 'machine-learning-zoomcamp', 'id': '636f55d5'}\n", "{'text': 'No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.', 'section': 'General course-related questions', 'question': 'Can I submit the homework after the due date?', 'course': 'machine-learning-zoomcamp', 'id': 'c839b764'}\n", "{'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\\nOr you can just use this link: http://mlzoomcamp.com/#syllabus', 'section': 'General course-related questions', 'question': 'I just joined. What should I do next? How can I access course materials?', 'course': 'machine-learning-zoomcamp', 'id': '0a278fb2'}\n", "{'text': 'For the 2023 cohort, you can see the deadlines here (it’s taken from the 2023 cohort page)', 'section': 'General course-related questions', 'question': 'What are the deadlines in this course?', 'course': 'machine-learning-zoomcamp', 'id': '8de4fefd'}\n", "{'text': 'There’s not much difference. There was one special module (BentoML) in the previous iteration of the course, but the rest of the modules are the same as in 2022. The homework this year is different.', 'section': 'General course-related questions', 'question': 'What’s the difference between the previous iteration of the course (2022) and this one (2023)?', 'course': 'machine-learning-zoomcamp', 'id': '94e86808'}\n", "{'text': 'We won’t re-record the course videos. The focus of the course and the skills we want to teach remained the same, and the videos are still up-to-date.\\nIf you haven’t taken part in the previous iteration, you can start watching the videos. It’ll be useful for you and you will learn new things. However, we recommend using Python 3.10 now instead of Python 3.8.', 'section': 'General course-related questions', 'question': 'The course videos are from the previous iteration. Will you release new ones or we’ll use the videos from 2021?', 'course': 'machine-learning-zoomcamp', 'id': 'e7ba6b8a'}\n", "{'text': 'When you post about what you learned from the course on your social media pages, use the tag #mlzoomcamp. When you submit your homework, there’s a section in the form for putting the links there. Separate multiple links by any whitespace character (linebreak, space, tab, etc).\\nFor posting the learning in public links, you get extra scores. But the number of scores is limited to 7 points: if you put more than 7 links in your homework form, you’ll get only 7 points.\\nThe same content can be posted to 7 different social sites and still earn you 7 points if you add 7 URLs per week, see Alexey’s reply. (~ ellacharmed)\\nFor midterms/capstones, the awarded points are doubled as the duration is longer. So for projects the points are capped at 14 for 14 URLs.', 'section': 'General course-related questions', 'question': 'Submitting learning in public links', 'course': 'machine-learning-zoomcamp', 'id': 'f7bc2f65'}\n", "{'text': \"You can create your own github repository for the course with your notes, homework, projects, etc.\\nThen fork the original course repo and add a link under the 'Community Notes' section to the notes that are in your own repo.\\nAfter that's done, create a pull request to sync your fork with the original course repo.\\n(By Wesley Barreto)\", 'section': 'General course-related questions', 'question': 'Adding community notes', 'course': 'machine-learning-zoomcamp', 'id': 'ae52a907'}\n", "{'text': \"Leaderboard Links:\\n2023 - https://docs.google.com/spreadsheets/d/e/2PACX-1vSNK_yGtELX1RJK1SSRl4xiUbD0XZMYS6uwHnybc7Mql-WMnMgO7hHSu59w-1cE7FeFZjkopbh684UE/pubhtml\\n2022 - https://docs.google.com/spreadsheets/d/e/2PACX-1vQzLGpva63gb2rIilFnpZMRSb-buyr5oGh8jmDtIb8DANo4n6hDalra_WRCl4EZwO1JvaC4UIS62n5h/pubhtml\\nPython Code:\\nfrom hashlib import sha1\\ndef compute_hash(email):\\nreturn sha1(email.lower().encode('utf-8')).hexdigest()\\nYou need to call the function as follows:\\nprint(compute_hash('YOUR_EMAIL_HERE'))\\nThe quotes are required to denote that your email is a string.\\n(By Wesley Barreto)\\nYou can also use this website directly by entering your email: http://www.sha1-online.com. Then, you just have to copy and paste your hashed email in the “research” bar of the leaderboard to get your scores.\\n(Mélanie Fouesnard)\", 'section': '1. Introduction to Machine Learning', 'question': 'Computing the hash for the leaderboard and project review', 'course': 'machine-learning-zoomcamp', 'id': 'dab5a24a'}\n", "{'text': 'If you get “wget is not recognized as an internal or external command”, you need to install it.\\nOn Ubuntu, run\\nsudo apt-get install wget\\nOn Windows, the easiest way to install wget is to use Chocolatey:\\nchoco install wget\\nOr you can download a binary from here and put it to any location in your PATH (e.g. C:/tools/)\\nOn Mac, the easiest way to install wget is to use brew.\\nBrew install wget\\nAlternatively, you can use a Python wget library, but instead of simply using “wget” you’ll need eeeto use\\npython -m wget\\nYou need to install it with pip first:\\npip install wget\\nAnd then in your python code, for example in your jupyter notebook, use:\\nimport wget\\nwget.download(\"URL\")\\nThis should download whatever is at the URL in the same directory as your code.\\n(Memoona Tahira)\\nAlternatively, you can read a CSV file from a URL directly with pandas:\\nurl = \"https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\"\\ndf = pd.read_csv(url)\\nValid URL schemes include http, ftp, s3, gs, and file.\\nIn some cases you might need to bypass https checks:\\nimport ssl\\nssl._create_default_https_context = ssl._create_unverified_context\\nOr you can use the built-in Python functionality for downloading the files:\\nimport urllib.request\\nurl = \"https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\"\\nurllib.request.urlretrieve(url, \"housing.csv\")\\nUrllib.request.urlretrieve() is a standard Python library function available on all devices and platforms. URL requests and URL data retrieval are done with the urllib.request module.\\nThe urlretrieve() function allows you to download files from URLs and save them locally. Python programs use it to download files from the internet.\\nOn any Python-enabled device or platform, you can use the urllib.request.urlretrieve() function to download the file.\\n(Mohammad Emad Sharifi)', 'section': '1. Introduction to Machine Learning', 'question': 'wget is not recognized as an internal or external command', 'course': 'machine-learning-zoomcamp', 'id': '49f9bda9'}\n", "{'text': 'You can use\\n!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\\nTo download the data too. The exclamation mark !, lets you execute shell commands inside your notebooks. This works generally for shell commands such as ls, cp, mkdir, mv etc . . .\\nFor instance, if you then want to move your data into a data directory alongside your notebook-containing directory, you could execute the following:\\n!mkdir -p ../data/\\n!mv housing.csv ../data/', 'section': '1. Introduction to Machine Learning', 'question': 'Retrieving csv inside notebook', 'course': 'machine-learning-zoomcamp', 'id': 'd44de7d1'}\n", "{'text': '(Tyler Simpson)', 'section': '1. Introduction to Machine Learning', 'question': 'Windows WSL and VS Code\\nIf you have a Windows 11 device and would like to use the built in WSL to access linux you can use the Microsoft Learn link Set up a WSL development environment | Microsoft Learn. To connect this to VS Code download the Microsoft verified VS Code extension ‘WSL’ this will allow you to remotely connect to your WSL Ubuntu instance as if it was a virtual machine.', 'course': 'machine-learning-zoomcamp', 'id': '314ebe32'}\n", "{'text': 'This is my first time using Github to upload a code. I was getting the below error message when I type\\ngit push -u origin master:\\nerror: src refspec master does not match any\\nerror: failed to push some refs to \\'https://github.com/XXXXXX/1st-Homework.git\\'\\nSolution:\\nThe error message got fixed by running below commands:\\ngit commit -m \"initial commit\"\\ngit push origin main\\nIf this is your first time to use Github, you will find a great & straightforward tutorial in this link https://dennisivy.com/github-quickstart\\n(Asia Saeed)\\nYou can also use the “upload file” functionality from GitHub for that\\nIf you write your code on Google colab you can also directly share it on your Github.\\n(By Pranab Sarma)', 'section': '1. Introduction to Machine Learning', 'question': 'Uploading the homework to Github', 'course': 'machine-learning-zoomcamp', 'id': '98cff602'}\n", "{'text': \"I'm trying to invert the matrix but I got error that the matrix is singular matrix\\nThe singular matrix error is caused by the fact that not every matrix can be inverted. In particular, in the homework it happens because you have to pay close attention when dealing with multiplication (the method .dot) since multiplication is not commutative! X.dot(Y) is not necessarily equal to Y.dot(X), so respect the order otherwise you get the wrong matrix.\", 'section': '1. Introduction to Machine Learning', 'question': 'Singular Matrix Error', 'course': 'machine-learning-zoomcamp', 'id': '54ec0de4'}\n", "{'text': 'I have a problem with my terminal. Command\\nconda create -n ml-zoomcamp python=3.9\\ndoesn’t work. Any of 3.8/ 3.9 / 3.10 should be all fine\\nIf you’re on Windows and just installed Anaconda, you can use Anaconda’s own terminal called “Anaconda Prompt”.\\nIf you don’t have Anaconda or Miniconda, you should install it first\\n(Tatyana Mardvilko)', 'section': '1. Introduction to Machine Learning', 'question': 'Conda is not an internal command', 'course': 'machine-learning-zoomcamp', 'id': 'f81f4ecb'}\n", "{'text': 'How do I read the dataset with Pandas in Windows?\\nI used the code below but not working\\ndf = pd.read_csv(\\'C:\\\\Users\\\\username\\\\Downloads\\\\data.csv\\')\\nUnlike Linux/Mac OS, Windows uses the backslash (\\\\) to navigate the files that cause the conflict with Python. The problem with using the backslash is that in Python, the \\'\\\\\\' has a purpose known as an escape sequence. Escape sequences allow us to include special characters in strings, for example, \"\\\\n\" to add a new line or \"\\\\t\" to add spaces, etc. To avoid the issue we just need to add \"r\" before the file path and Python will treat it as a literal string (not an escape sequence).\\nHere’s how we should be loading the file instead:\\ndf = pd.read_csv(r\\'C:\\\\Users\\\\username\\\\Downloads\\\\data.csv\\')\\n(Muhammad Awon)', 'section': '1. Introduction to Machine Learning', 'question': 'Read-in the File in Windows OS', 'course': 'machine-learning-zoomcamp', 'id': 'be760b92'}\n", "{'text': 'Type the following command:\\ngit config -l | grep url\\nThe output should look like this:\\nremote.origin.url=https://github.com/github-username/github-repository-name.git\\nChange this to the following format and make sure the change is reflected using command in step 1:\\ngit remote set-url origin \"https://github-username@github.com/github-username/github-repository-name.git\"\\n(Added by Dheeraj Karra)', 'section': '1. Introduction to Machine Learning', 'question': \"'403 Forbidden' error message when you try to push to a GitHub repository\", 'course': 'machine-learning-zoomcamp', 'id': 'a2cfa1c9'}\n", "{'text': \"I had a problem when I tried to push my code from Git Bash:\\nremote: Support for password authentication was removed on August 13, 2021.\\nremote: Please see https://docs.github.com/en/get-started/getting-started-with-git/about-remote-repositories#cloning-with-https-urls for information on currently recommended modes of authentication.\\nfatal: Authentication failed for 'https://github.com/username\\nSolution:\\nCreate a personal access token from your github account and use it when you make a push of your last changes.\\nhttps://docs.github.com/en/authentication/connecting-to-github-with-ssh/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent\\nBruno Bedón\", 'section': '1. Introduction to Machine Learning', 'question': \"Fatal: Authentication failed for 'https://github.com/username\", 'course': 'machine-learning-zoomcamp', 'id': '7b907071'}\n", "{'text': \"In Kaggle, when you are trying to !wget a dataset from github (or any other public repository/location), you get the following error:\\nGetting this error while trying to import data- !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\\n--2022-09-17 16:55:24-- https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\\nResolving raw.githubusercontent.com (raw.githubusercontent.com)... failed: Temporary failure in name resolution.\\nwget: unable to resolve host address 'raw.githubusercontent.com'\\nSolution:\\nIn your Kaggle notebook settings, turn on the Internet for your session. It's on the settings panel, on the right hand side of the Kaggle screen. You'll be asked to verify your phone number so Kaggle knows you are not a bot.\", 'section': '1. Introduction to Machine Learning', 'question': \"wget: unable to resolve host address 'raw.githubusercontent.com'\", 'course': 'machine-learning-zoomcamp', 'id': 'fc2e0a61'}\n", "{'text': 'I found this video quite helpful: Creating Virtual Environment for Python from VS Code\\n[Native Jupiter Notebooks support in VS Code] In VS Code you can also have a native Jupiter Notebooks support, i.e. you do not need to open a web browser to code in a Notebook. If you have port forwarding enabled + run a ‘jupyter notebook ‘ command from a remote machine + have a remote connection configured in .ssh/config (as Alexey’s video suggests) - VS Code can execute remote Jupyter Notebooks files on a remote server from your local machine: https://code.visualstudio.com/docs/datascience/jupyter-notebooks.\\n[Git support from VS Code] You can work with Github from VSCode - staging and commits are easy from the VS Code’s UI: https://code.visualstudio.com/docs/sourcecontrol/overview\\n(Added by Ivan Brigida)', 'section': '1. Introduction to Machine Learning', 'question': 'Setting up an environment using VS Code', 'course': 'machine-learning-zoomcamp', 'id': 'd43e5742'}\n", "{'text': 'With regards to creating an environment for the project, do we need to run the command \"conda create -n .......\" and \"conda activate ml-zoomcamp\" everytime we open vs code to work on the project?\\nAnswer:\\n\"conda create -n ....\" is just run the first time to create the environment. Once created, you just need to run \"conda activate ml-zoomcamp\" whenever you want to use it.\\n(Added by Wesley Barreto)\\nconda env export > environment.yml will also allow you to reproduce your existing environment in a YAML file. You can then recreate it with conda env create -f environment.yml', 'section': '1. Introduction to Machine Learning', 'question': 'Conda Environment Setup', 'course': 'machine-learning-zoomcamp', 'id': '32bc0538'}\n", "{'text': \"I was doing Question 7 from Week1 Homework and with step6: Invert XTX, I created the inverse. Now, an inverse when multiplied by the original matrix should return in an Identity matrix. But when I multiplied the inverse with the original matrix, it gave a matrix like this:\\nInverse * Original:\\n[[ 1.00000000e+00 -1.38777878e-16]\\n[ 3.16968674e-13 1.00000000e+00]]\\nSolution:\\nIt's because floating point math doesn't work well on computers as shown here: https://stackoverflow.com/questions/588004/is-floating-point-math-broken\\n(Added by Wesley Barreto)\", 'section': '1. Introduction to Machine Learning', 'question': 'Floating Point Precision', 'course': 'machine-learning-zoomcamp', 'id': 'b6730228'}\n", "{'text': 'Answer:\\nIt prints the information about the dataset like:\\nIndex datatype\\nNo. of entries\\nColumn information with not-null count and datatype\\nMemory usage by dataset\\nWe use it as:\\ndf.info()\\n(Added by Aadarsha Shrestha & Emoghena Itakpe)', 'section': '1. Introduction to Machine Learning', 'question': 'What does pandas.DataFrame.info() do?', 'course': 'machine-learning-zoomcamp', 'id': '3ce9bbb8'}\n", "{'text': \"Pandas and numpy libraries are not being imported\\nNameError: name 'np' is not defined\\nNameError: name 'pd' is not defined\\nIf you're using numpy or pandas, make sure you use the first few lines before anything else.\\nimport pandas as pd\\nimport numpy as np\\nAdded by Manuel Alejandro Aponte\", 'section': '1. Introduction to Machine Learning', 'question': \"NameError: name 'np' is not defined\", 'course': 'machine-learning-zoomcamp', 'id': '4e584d06'}\n", "{'text': \"What if there were hundreds of columns? How do you get the columns only with numeric or object data in a more concise way?\\ndf.select_dtypes(include=np.number).columns.tolist()\\ndf.select_dtypes(include='object').columns.tolist()\\nAdded by Gregory Morris\", 'section': '1. Introduction to Machine Learning', 'question': 'How to select column by dtype', 'course': 'machine-learning-zoomcamp', 'id': 'ff4da2b6'}\n", "{'text': 'There are many ways to identify the shape of dataset, one of them is using .shape attribute!\\ndf.shape\\ndf.shape[0] # for identify the number of rows\\ndf.shape[1] # for identify the number of columns\\nAdded by Radikal Lukafiardi', 'section': '1. Introduction to Machine Learning', 'question': 'How to identify the shape of dataset in Pandas', 'course': 'machine-learning-zoomcamp', 'id': '58c1c168'}\n", "{'text': 'First of all use np.dot for matrix multiplication. When you compute matrix-matrix multiplication you should understand that order of multiplying is crucial and affects the result of the multiplication!\\nDimension Mismatch\\nTo perform matrix multiplication, the number of columns in the 1st matrix should match the number of rows in the 2nd matrix. You can rearrange the order to make sure that this satisfies the condition.\\nAdded by Leah Gotladera', 'section': '1. Introduction to Machine Learning', 'question': 'How to avoid Value errors with array shapes in homework?', 'course': 'machine-learning-zoomcamp', 'id': '96076a1a'}\n", "{'text': 'You would first get the average of the column and save it to a variable, then replace the NaN values with the average variable.\\nThis method is called imputing - when you have NaN/ null values in a column, but you do not want to get rid of the row because it has valuable information contributing to other columns.\\nAdded by Anneysha Sarkar', 'section': '1. Introduction to Machine Learning', 'question': 'Question 5: How and why do we replace the NaN values with average of the column?', 'course': 'machine-learning-zoomcamp', 'id': '3218389a'}\n", "{'text': 'In Question 7 we are asked to calculate\\nThe initial problem can be solved by this, where a Matrix X is multiplied by some unknown weights w resulting in the target y.\\nAdditional reading and videos:\\nOrdinary least squares\\nMultiple Linear Regression in Matrix Form\\nPseudoinverse Solution to OLS\\nAdded by Sylvia Schmitt\\nwith commends from Dmytro Durach', 'section': '1. Introduction to Machine Learning', 'question': 'Question 7: Mathematical formula for linear regression', 'course': 'machine-learning-zoomcamp', 'id': '183a1c90'}\n", "{'text': 'This is most likely that you interchanged the first step of the multiplication\\nYou used instead of\\nAdded by Emmanuel Ikpesu', 'section': '1. Introduction to Machine Learning', 'question': 'Question 7: FINAL MULTIPLICATION not having 5 column', 'course': 'machine-learning-zoomcamp', 'id': 'f0bc1c19'}\n", "{'text': 'Note, that matrix multiplication (matrix-matrix, matrix-vector multiplication) can be written as * operator in some sources, but performed as @ operator or np.matmul() via numpy. * operator performs element-wise multiplication (Hadamard product).\\nnumpy.dot() or ndarray.dot() can be used, but for matrix-matrix multiplication @ or np.matmul() is preferred (as per numpy doc).\\nIf multiplying by a scalar numpy.multiply() or * is preferred.\\nAdded by Andrii Larkin', 'section': '1. Introduction to Machine Learning', 'question': 'Question 7: Multiplication operators.', 'course': 'machine-learning-zoomcamp', 'id': '735e6c78'}\n", "{'text': 'If you face an error kind of ImportError: cannot import name \\'contextfilter\\' from \\'jinja2\\' (anaconda\\\\lib\\\\site-packages\\\\jinja2\\\\__init__.py) when launching a new notebook for a brand new environment.\\nSwitch to the main environment and run \"pip install nbconvert --upgrade\".\\nAdded by George Chizhmak', 'section': '1. Introduction to Machine Learning', 'question': 'Error launching Jupyter notebook', 'course': 'machine-learning-zoomcamp', 'id': 'b8ca1cd3'}\n", "{'text': 'If you face this situation and see IPv6 addresses in the terminal, go to your System Settings > Network > your network connection > Details > Configure IPv6 > set to Manually > OK. Then try again', 'section': '1. Introduction to Machine Learning', 'question': 'wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv hangs on MacOS Ventura M1', 'course': 'machine-learning-zoomcamp', 'id': 'efdb235f'}\n", "{'text': \"Wget doesn't ship with macOS, so there are other alternatives to use.\\nNo worries, we got curl:\\nexample:\\ncurl -o ./housing.csv https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\\nExplanations:\\ncurl: a utility for retrieving information from the internet.\\n-o: Tell it to store the result as a file.\\nfilename: You choose the file's name.\\nLinks: Put the web address (URL) here, and cURL will extract data from it and save it under the name you provide.\\nMore about it at:\\nCurl Documentation\\nAdded by David Espejo\", 'section': '1. Introduction to Machine Learning', 'question': 'In case you are using mac os and having trouble with WGET', 'course': 'machine-learning-zoomcamp', 'id': '355348f0'}\n", "{'text': \"You can use round() function or f-strings\\nround(number, 4) - this will round number up to 4 decimal places\\nprint(f'Average mark for the Homework is {avg:.3f}') - using F string\\nAlso there is pandas.Series. round idf you need to round values in the whole Series\\nPlease check the documentation\\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.round.html#pandas.Series.round\\nAdded by Olga Rudakova\", 'section': '2. Machine Learning for Regression', 'question': 'How to output only a certain number of decimal places', 'course': 'machine-learning-zoomcamp', 'id': '67afabf5'}\n", "{'text': 'Here are the crucial links for this Week 2 that starts September 18, 2023\\nAsk questions for Live Sessions: https://app.sli.do/event/vsUpjYsayZ8A875Hq8dpUa/live/questions\\nCalendar for weekly meetings: https://calendar.google.com/calendar/u/0/r?cid=cGtjZ2tkbGc1OG9yb2lxa2Vwc2g4YXMzMmNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ&pli=1\\nWeek 2 HW: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/02-regression/homework.md\\nSubmit HW Week 2: https://docs.google.com/forms/d/e/1FAIpQLSf8eMtnErPFqzzFsEdLap_GZ2sMih-H-Y7F_IuPGqt4fOmOJw/viewform (also available at the bottom of the above link)\\nAll HWs: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/\\nGitHub for theory: https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/course-zoomcamp\\nYoutube Link: 2.X --- https://www.youtube.com/watch?v=vM3SqPNlStE&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=12\\nFAQs: https://docs.google.com/document/d/1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8/edit#heading=h.lpz96zg7l47j\\n~~Nukta Bhatia~~', 'section': '2. Machine Learning for Regression', 'question': 'How do I get started with Week 2?', 'course': 'machine-learning-zoomcamp', 'id': '50d737e7'}\n", "{'text': 'We can use histogram:\\nimport pandas as pd\\nimport matplotlib.pyplot as plt\\nimport seaborn as sns\\n# Load the data\\nurl = \\'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\\'\\ndf = pd.read_csv(url)\\n# EDA\\nsns.histplot(df[\\'median_house_value\\'], kde=False)\\nplt.show()\\nOR ceck skewness and describe:\\nprint(df[\\'median_house_value\\'].describe())\\n# Calculate the skewness of the \\'median_house_value\\' variable\\nskewness = df[\\'median_house_value\\'].skew()\\n# Print the skewness value\\nprint(\"Skewness of \\'median_house_value\\':\", skewness)\\n(Mohammad Emad Sharifi)', 'section': '2. Machine Learning for Regression', 'question': 'Checking long tail of data', 'course': 'machine-learning-zoomcamp', 'id': 'bbc0fca3'}\n", "{'text': 'It’s possible that when you follow the videos, you’ll get a Singular Matrix error. We will explain why it happens in the Regularization video. Don’t worry, it’s normal that you have it.\\nYou can also have an error because you did the inverse of X once in your code and you’re doing it a second time.\\n(Added by Cécile Guillot)', 'section': '2. Machine Learning for Regression', 'question': 'LinAlgError: Singular matrix', 'course': 'machine-learning-zoomcamp', 'id': '6f3bdd20'}\n", "{'text': 'You can find a detailed description of the dataset ere https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html\\nKS', 'section': '2. Machine Learning for Regression', 'question': 'California housing dataset', 'course': 'machine-learning-zoomcamp', 'id': '27c2d90a'}\n", "{'text': 'I was using for loops to apply rmse to list of y_val and y_pred. But the resulting rmse is all nan.\\nI found out that the problem was when my data reached the mean step after squaring the error in the rmse function. Turned out there were nan in the array, then I traced the problem back to where I first started to split the data: I had only use fillna(0) on the train data, not on the validation and test data. So the problem was fixed after I applied fillna(0) to all the dataset (train, val, test). Voila, my for loops to get rmse from all the seed values work now.\\nAdded by Sasmito Yudha Husada', 'section': '2. Machine Learning for Regression', 'question': 'Getting NaNs after applying .mean()', 'course': 'machine-learning-zoomcamp', 'id': '88e9600a'}\n", "{'text': 'Why should we transform the target variable to logarithm distribution? Do we do this for all machine learning projects?\\nOnly if you see that your target is highly skewed. The easiest way to evaluate this is by plotting the distribution of the target variable.\\nThis can help to understand skewness and how it can be applied to the distribution of your data set.\\nhttps://en.wikipedia.org/wiki/Skewness\\nPastor Soto', 'section': '2. Machine Learning for Regression', 'question': 'Target variable transformation', 'course': 'machine-learning-zoomcamp', 'id': 'd59d8df7'}\n", "{'text': 'The dataset can be read directly to pandas dataframe from the github link using the technique shown below\\ndfh=pd.read_csv(\"https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\")\\nKrishna Anand', 'section': '2. Machine Learning for Regression', 'question': 'Reading the dataset directly from github', 'course': 'machine-learning-zoomcamp', 'id': '0b3eaf92'}\n", "{'text': \"For users of kaggle notebooks, the dataset can be loaded through widget using the below command. Please remember that ! before wget is essential\\n!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\\nOnce the dataset is loaded to the kaggle notebook server, it can be read through the below pandas command\\ndf = pd.read_csv('housing.csv')\\nHarish Balasundaram\", 'section': '2. Machine Learning for Regression', 'question': 'Loading the dataset directly through Kaggle Notebooks', 'course': 'machine-learning-zoomcamp', 'id': '8fe56032'}\n", "{'text': 'We can filter a dataset by using its values as below.\\ndf = df[(df[\"ocean_proximity\"] == \"<1H OCEAN\") | (df[\"ocean_proximity\"] == \"INLAND\")]\\nYou can use | for ‘OR’, and & for ‘AND’\\nAlternative:\\ndf = df[df[\\'ocean_proximity\\'].isin([\\'<1H OCEAN\\', \\'INLAND\\'])]\\nRadikal Lukafiardi', 'section': '2. Machine Learning for Regression', 'question': 'Filter a dataset by using its values', 'course': 'machine-learning-zoomcamp', 'id': 'af833e0a'}\n", "{'text': 'Above users showed how to load the dataset directly from github. Here is another useful way of doing this using the `requests` library:\\n# Get data for homework\\nimport requests\\nurl = \\'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv\\'\\nresponse = requests.get(url)\\nif response.status_code == 200:\\nwith open(\\'housing.csv\\', \\'wb\\') as file:\\nfile.write(response.content)\\nelse:\\nprint(\"Download failed.\")\\nTyler Simpson', 'section': '2. Machine Learning for Regression', 'question': 'Alternative way to load the data using requests', 'course': 'machine-learning-zoomcamp', 'id': '8d209d6d'}\n", "{'text': 'When creating a duplicate of your dataframe by doing the following:\\nX_train = df_train\\nX_val = df_val\\nYou’re still referencing the original variable, this is called a shallow copy. You can make sure that no references are attaching both variables and still keep the copy of the data do the following to create a deep copy:\\nX_train = df_train.copy()\\nX_val = df_val.copy()\\nAdded by Ixchel García', 'section': '2. Machine Learning for Regression', 'question': 'Null column is appearing even if I applied .fillna()', 'course': 'machine-learning-zoomcamp', 'id': '0bc4c3da'}\n", "{'text': 'Yes, you can. Here we implement it ourselves to better understand how it works, but later we will only rely on Scikit-Learn’s functions. If you want to start using it earlier — feel free to do it', 'section': '2. Machine Learning for Regression', 'question': 'Can I use Scikit-Learn’s train_test_split for this week?', 'course': 'machine-learning-zoomcamp', 'id': 'c0ee2665'}\n", "{'text': 'Yes, you can. We will also do that next week, so don’t worry, you will learn how to do it.', 'section': '2. Machine Learning for Regression', 'question': 'Can I use LinearRegression from Scikit-Learn for this week?', 'course': 'machine-learning-zoomcamp', 'id': '3f60871d'}\n", "{'text': 'What are equivalents in Scikit-Learn for the linear regression with and without regularization used in week 2.\\nCorresponding function for model without regularization:\\nsklearn.linear_model.LinearRegression\\nCorresponding function for model with regularization:\\nsklearn.linear_model.Ridge\\nThe linear model from Scikit-Learn are explained here:\\nhttps://scikit-learn.org/stable/modules/linear_model.html\\nAdded by Sylvia Schmitt', 'section': '2. Machine Learning for Regression', 'question': 'Corresponding Scikit-Learn functions for Linear Regression (with and without Regularization)', 'course': 'machine-learning-zoomcamp', 'id': 'f30217a7'}\n", "{'text': '`r` is a regularization parameter.\\nIt’s similar to `alpha` in sklearn.Ridge(), as both control the \"strength\" of regularization (increasing both will lead to stronger regularization), but mathematically not quite, here\\'s how both are used:\\nsklearn.Ridge()\\n||y - Xw||^2_2 + alpha * ||w||^2_2\\nlesson’s notebook (`train_linear_regression_reg` function)\\nXTX = XTX + r * np.eye(XTX.shape[0])\\n`r` adds “noise” to the main diagonal to prevent multicollinearity, which “breaks” finding inverse matrix.', 'section': '2. Machine Learning for Regression', 'question': 'Question 4: what is `r`, is it the same as `alpha` in sklearn.Ridge()?', 'course': 'machine-learning-zoomcamp', 'id': '91fc573d'}\n", "{'text': 'Q: “In lesson 2.8 why is y_pred different from y? After all, we trained X_train to get the weights that when multiplied by X_train should give exactly y, or?”\\nA: linear regression is a pretty simple model, it neither can nor should fit 100% (nor any other model, as this would be the sign of overfitting). This picture might illustrate some intuition behind this, imagine X is a single feature:\\nAs our model is linear, how would you draw a line to fit all the \"dots\"?\\nYou could \"fit\" all the \"dots\" on this pic using something like scipy.optimize.curve_fit (non-linear least squares) if you wanted to, but imagine how it would perform on previously unseen data.\\nAdded by Andrii Larkin', 'section': '2. Machine Learning for Regression', 'question': 'Why linear regression doesn’t provide a “perfect” fit?', 'course': 'machine-learning-zoomcamp', 'id': 'fe3139f6'}\n", "{'text': 'One of the questions on the homework calls for using a random seed of 42. When using 42, all my missing values ended up in my training dataframe and not my validation or test dataframes, why is that?\\nThe purpose of the seed value is to randomly generate the proportion split. Using a seed of 42 ensures that all learners are on the same page by getting the same behavior (in this case, all missing values ending up in the training dataframe). If using a different seed value (e.g. 9), missing values will then appear in all other dataframes.', 'section': '2. Machine Learning for Regression', 'question': 'Random seed 42', 'course': 'machine-learning-zoomcamp', 'id': '48aac030'}\n", "{'text': 'It is possible to do the shuffling of the dataset with the pandas built-in function pandas.DataFrame.sample.The complete dataset can be shuffled including resetting the index with the following commands:\\nSetting frac=1 will result in returning a shuffled version of the complete Dataset.\\nSetting random_state=seed will result in the same randomization as used in the course resources.\\ndf_shuffled = df.sample(frac=1, random_state=seed)\\ndf_shuffled.reset_index(drop=True, inplace=True)\\nAdded by Sylvia Schmitt', 'section': '2. Machine Learning for Regression', 'question': 'Shuffling the initial dataset using pandas built-in function', 'course': 'machine-learning-zoomcamp', 'id': '28321bc2'}\n", "{'text': 'That’s normal. We all have different environments: our computers have different versions of OS and different versions of libraries — even different versions of Python.\\nIf it’s the case, just select the option that’s closest to your answer', 'section': '2. Machine Learning for Regression', 'question': \"The answer I get for one of the homework questions doesn't match any of the options. What should I do?\", 'course': 'machine-learning-zoomcamp', 'id': 'edb92d22'}\n", "{'text': \"In question 3 of HW02 it is mentioned: ‘For computing the mean, use the training only’. What does that mean?\\nIt means that you should use only the training data set for computing the mean, not validation or test data set. This is how you can calculate the mean\\ndf_train['column_name'].mean( )\\nAnother option:\\ndf_train[‘column_name’].describe()\\n(Bhaskar Sarma)\", 'section': '2. Machine Learning for Regression', 'question': 'Meaning of mean in homework 2, question 3', 'course': 'machine-learning-zoomcamp', 'id': 'f488ce85'}\n", "{'text': 'When the target variable has a long tail distribution, like in prices, with a wide range, you can transform the target variable with np.log1p() method, but be aware if your target variable has negative values, this method will not work', 'section': '2. Machine Learning for Regression', 'question': 'When should we transform the target variable to logarithm distribution?', 'course': 'machine-learning-zoomcamp', 'id': 'bf395099'}\n", "{'text': 'If we try to perform an arithmetic operation between 2 arrays of different shapes or different dimensions, it throws an error like operands could not be broadcast together with shapes. There are some scenarios when broadcasting can occur and when it fails.\\nIf this happens sometimes we can use * operator instead of dot() method to solve the issue. So that the error is solved and also we get the dot product.\\n(Santhosh Kumar)', 'section': '2. Machine Learning for Regression', 'question': 'ValueError: shapes not aligned', 'course': 'machine-learning-zoomcamp', 'id': '01cd3b35'}\n", "{'text': 'Copy of a dataframe is made with X_copy = X.copy().\\nThis is called creating a deep copy. Otherwise it will keep changing the original dataframe if used like this: X_copy = X.\\nAny changes to X_copy will reflect back to X. This is not a real copy, instead it is a “view”.\\n(Memoona Tahira)', 'section': '2. Machine Learning for Regression', 'question': 'How to copy a dataframe without changing the original dataframe?', 'course': 'machine-learning-zoomcamp', 'id': '5551c92e'}\n", "{'text': 'One of the most important characteristics of the normal distribution is that mean=median=mode, this means that the most popular value, the mean of the distribution and 50% of the sample are under the same value, this is equivalent to say that the area under the curve (black) is the same on the left and on the right. The long tail (red curve) is the result of having a few observations with high values, now the behaviour of the distribution changes, first of all, the area is different on each side and now the mean, median and mode are different. As a consequence, the mean is no longer representative, the range is larger than before and the probability of being on the left or on the right is not the same.\\n(Tatiana Dávila)', 'section': '2. Machine Learning for Regression', 'question': 'What does ‘long tail’ mean?', 'course': 'machine-learning-zoomcamp', 'id': '94f928d2'}\n", "{'text': 'In statistics, the standard deviation is a measure of the amount of variation or dispersion of a set of values. A low standard deviation indicates that the values tend to be close to the mean (also called the expected value) of the set, while a high standard deviation indicates that the values are spread out over a wider range. [Wikipedia] The formula to calculate standard deviation is:\\n(Aadarsha Shrestha)', 'section': '2. Machine Learning for Regression', 'question': 'What is standard deviation?', 'course': 'machine-learning-zoomcamp', 'id': '266faa6d'}\n", "{'text': 'The application of regularization depends on the specific situation and problem. It is recommended to consider it when training machine learning models, especially with small datasets or complex models, to prevent overfitting. However, its necessity varies depending on the data quality and size. Evaluate each case individually to determine if it is needed.\\n(Daniel Muñoz Viveros)', 'section': '2. Machine Learning for Regression', 'question': 'Do we need to apply regularization techniques always? Or only in certain scenarios?', 'course': 'machine-learning-zoomcamp', 'id': 'c21f99f5'}\n", "{'text': 'As it speeds up the development:\\nprepare_df(initial_df, seed, fill_na_type) - that prepared all 3 dataframes and 3 y_vectors. Fillna() can be done before the initial_df is split.\\nOf course, you can reuse other functions: rmse() and train_linear_regression(X,y,r) from the class notebook\\n(Ivan Brigida)', 'section': '2. Machine Learning for Regression', 'question': 'Shortcut: define functions for faster execution', 'course': 'machine-learning-zoomcamp', 'id': '13702957'}\n", "{'text': 'If we have a list or series of data for example x = [1,2,3,4,5]. We can use pandas to find the standard deviation. We can pass our list into panda series and call standard deviation directly on the series pandas.Series(x).std().\\n(Quinn Avila)', 'section': '2. Machine Learning for Regression', 'question': 'How to use pandas to find standard deviation', 'course': 'machine-learning-zoomcamp', 'id': '7cd652c5'}\n", "{'text': 'Numpy and Pandas packages use different equations to compute the standard deviation. Numpy uses population standard deviation, whereas pandas uses sample standard deviation by default.\\nNumpy\\nPandas\\npandas default standard deviation is computed using one degree of freedom. You can change degree in of freedom in NumPy to change this to unbiased estimator by using ddof parameter:\\nimport numpy as np\\nnp.std(df.weight, ddof=1)\\nThe result will be similar if we change the dof = 1 in numpy\\n(Harish Balasundaram)', 'section': '2. Machine Learning for Regression', 'question': 'Standard Deviation Differences in Numpy and Pandas', 'course': 'machine-learning-zoomcamp', 'id': 'e1f93d10'}\n", "{'text': \"In pandas you can use built in Pandas function names std() to get standard deviation. For example\\ndf['column_name'].std() to get standard deviation of that column.\\ndf[['column_1', 'column_2']].std() to get standard deviation of multiple columns.\\n(Khurram Majeed)\", 'section': '2. Machine Learning for Regression', 'question': 'Standard deviation using Pandas built in Function', 'course': 'machine-learning-zoomcamp', 'id': '36b9d1b7'}\n", "{'text': 'Use ‘pandas.concat’ function (https://pandas.pydata.org/docs/reference/api/pandas.concat.html) to combine two dataframes. To combine two numpy arrays use numpy.concatenate (https://numpy.org/doc/stable/reference/generated/numpy.concatenate.html) function. So the code would be as follows:\\ndf_train_combined = pd.concat([df_train, df_val])\\ny_train = np.concatenate((y_train, y_val), axis=0)\\n(George Chizhmak)', 'section': '2. Machine Learning for Regression', 'question': 'How to combine train and validation datasets', 'course': 'machine-learning-zoomcamp', 'id': '3c8b32a1'}\n", "{'text': 'The Root Mean Squared Error (RMSE) is one of the primary metrics to evaluate the performance of a regression model. It calculates the average deviation between the model\\'s predicted values and the actual observed values, offering insight into the model\\'s ability to accurately forecast the target variable. To calculate RMSE score:\\nLibraries needed\\nimport numpy as np\\nfrom sklearn.metrics import mean_squared_error\\nmse = mean_squared_error(actual_values, predicted_values)\\nrmse = np.sqrt(mse)\\nprint(\"Root Mean Squared Error (RMSE):\", rmse)\\n(Aminat Abolade)', 'section': '2. Machine Learning for Regression', 'question': 'Understanding RMSE and how to calculate RMSE score', 'course': 'machine-learning-zoomcamp', 'id': '05fb3a16'}\n", "{'text': 'If you would like to use multiple conditions as an example below you will get the error. The correct syntax for OR is |, and for AND is &\\n(Olga Rudakova)\\n–', 'section': '2. Machine Learning for Regression', 'question': 'What syntax use in Pandas for multiple conditions using logical AND and OR', 'course': 'machine-learning-zoomcamp', 'id': '225506b9'}\n", "{'text': 'I found this video pretty usual for understanding how we got the normal form with linear regression Normal Equation Derivation for Regression', 'section': '2. Machine Learning for Regression', 'question': 'Deep dive into normal equation for regression', 'course': 'machine-learning-zoomcamp', 'id': 'bd4a1395'}\n", "{'text': '(Hrithik Kumar Advani)', 'section': '2. Machine Learning for Regression', 'question': 'Useful Resource for Missing Data Treatment\\nhttps://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python/notebook', 'course': 'machine-learning-zoomcamp', 'id': '81b8e8d0'}\n", "{'text': 'The instruction for applying log transformation to the ‘median_house_value’ variable is provided before Q3 in the homework for Week-2 under the ‘Prepare and split the dataset’ heading.\\nHowever, this instruction is absent in the subsequent questions of the homework, and I got stuck with Q5 for a long time, trying to figure out why my RMSE was so huge, when it clicked to me that I forgot to apply log transformation to the target variable. Please remember to apply log transformation to the target variable for each question.\\n(Added by Soham Mundhada)', 'section': '2. Machine Learning for Regression', 'question': 'Caution for applying log transformation in Week-2 2023 cohort homework', 'course': 'machine-learning-zoomcamp', 'id': 'a7f6a33c'}\n", "{'text': 'Version 0.24.2 and Python 3.8.11\\n(Added by Diego Giraldo)', 'section': '3. Machine Learning for Classification', 'question': 'What sklearn version is Alexey using in the youtube videos?', 'course': 'machine-learning-zoomcamp', 'id': '129b4ac0'}\n", "{'text': 'Week 3 HW: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/03-classification/homework.md\\nSubmit HW Week 3: https://docs.google.com/forms/d/e/1FAIpQLSeXS3pqsv_smRkYmVx-7g6KIZDnG29g2s7pdHo-ASKNqtfRFQ/viewform\\nAll HWs: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/\\nEvaluation Matrix: https://docs.google.com/spreadsheets/d/e/2PACX-1vQCwqAtkjl07MTW-SxWUK9GUvMQ3Pv_fF8UadcuIYLgHa0PlNu9BRWtfLgivI8xSCncQs82HDwGXSm3/pubhtml\\nGitHub for theory: https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/course-zoomcamp\\nYoutube Link: 3.X --- https://www.youtube.com/watch?v=0Zw04wdeTQo&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=29\\n~~Nukta Bhatia~~', 'section': '3. Machine Learning for Classification', 'question': 'How do I get started with Week 3?', 'course': 'machine-learning-zoomcamp', 'id': 'b8cca8b7'}\n", "{'text': \"The error message “could not convert string to float: ‘Nissan’” typically occurs when a machine learning model or function is expecting numerical input, but receives a string instead. In this case, it seems like the model is trying to convert the car brand ‘Nissan’ into a numerical value, which isn’t possible.\\nTo resolve this issue, you can encode categorical variables like car brands into numerical values. One common method is one-hot encoding, which creates new binary columns for each category/label present in the original column.\\nHere’s an example of how you can perform one-hot encoding using pandas:\\nimport pandas as pd\\n# Assuming 'data' is your DataFrame and 'brand' is the column with car brands\\ndata_encoded = pd.get_dummies(data, columns=['brand'])\\nIn this code, pd.get_dummies() creates a new DataFrame where the ‘brand’ column is replaced with binary columns for each brand (e.g., ‘brand_Nissan’, ‘brand_Toyota’, etc.). Each row in the DataFrame has a 1 in the column that corresponds to its brand and 0 in all other brand columns.\\n-Mohammad Emad Sharifi-\", 'section': '3. Machine Learning for Classification', 'question': \"Could not convert string to float:’Nissan’rt string to float: 'Nissan'\", 'course': 'machine-learning-zoomcamp', 'id': '1091b10f'}\n", "{'text': 'Solution: Mutual Information score calculates the relationship between categorical variables or discrete variables. So in the homework, because the target which is median_house_value is continuous, we had to change it to binary format which in other words, makes its values discrete as either 0 or 1. If we allowed it to remain in the continuous variable format, the mutual information score could be calculated, but the algorithm would have to divide the continuous variables into bins and that would be highly subjective. That is why continuous variables are not used for mutual information score calculation.\\n—Odimegwu David—-', 'section': '3. Machine Learning for Classification', 'question': 'Why did we change the targets to binary format when calculating mutual information score in the homework?', 'course': 'machine-learning-zoomcamp', 'id': '0c7715a1'}\n", "{'text': \"Q2 asks about correlation matrix and converting median_house_value from numeric to binary. Just to make sure here we are only dealing with df_train not df_train_full, right? As the question explicitly mentions the train dataset.\\nYes. I think it is only on df_train. The reason behind this is that df_train_full also contains the validation dataset, so at this stage we don't want to make conclusions based on the validation data, since we want to test how we did without using that portion of the data.\\nPastor Soto\", 'section': '3. Machine Learning for Classification', 'question': 'What data should we use for correlation matrix', 'course': 'machine-learning-zoomcamp', 'id': 'd2043cf5'}\n", "{'text': \"The background of any dataframe can be colored (not only the correlation matrix) based on the numerical values the dataframe contains by using the method pandas.io.formats.style.Styler.background_graident.\\nHere an example on how to color the correlation matrix. A color map of choice can get passed, here ‘viridis’ is used.\\n# ensure to have only numerical values in the dataframe before calling 'corr'\\ncorr_mat = df_numerical_only.corr()\\ncorr_mat.style.background_gradient(cmap='viridis')\\nHere is an example of how the coloring will look like using a dataframe containing random values and applying “background_gradient” to it.\\nnp.random.seed = 3\\ndf_random = pd.DataFrame(data=np.random.random(3*3).reshape(3,3))\\ndf_random.style.background_gradient(cmap='viridis')\\nAdded by Sylvia Schmitt\", 'section': '3. Machine Learning for Classification', 'question': 'Coloring the background of the pandas.DataFrame.corr correlation matrix directly', 'course': 'machine-learning-zoomcamp', 'id': '44d22817'}\n", "{'text': 'data_corr = pd.DataFrame(data_num.corr().round(3).abs().unstack().sort_values(ascending=False))\\ndata_corr.head(10)\\nAdded by Harish Balasundaram\\nYou can also use seaborn to create a heatmap with the correlation. The code for doing that:\\nsns.heatmap(df[numerical_features].corr(),\\nannot=True,\\nsquare=True,\\nfmt=\".2g\",\\ncmap=\"crest\")\\nAdded by Cecile Guillot\\nYou can refine your heatmap and plot only a triangle, with a blue to red color gradient, that will show every correlation between your numerical variables without redundant information with this function:\\nWhich outputs, in the case of churn dataset:\\n(Mélanie Fouesnard)', 'section': '3. Machine Learning for Classification', 'question': 'Identifying highly correlated feature pairs easily through unstack', 'course': 'machine-learning-zoomcamp', 'id': '1f76dbeb'}\n", "{'text': \"Should we perform EDA on the base of train or train+validation or train+validation+test dataset?\\nIt's indeed good practice to only rely on the train dataset for EDA. Including validation might be okay. But we aren't supposed to touch the test dataset, even just looking at it isn't a good idea. We indeed pretend that this is the future unseen data\\nAlena Kniazeva\", 'section': '3. Machine Learning for Classification', 'question': 'What data should be used for EDA?', 'course': 'machine-learning-zoomcamp', 'id': 'b8071a54'}\n", "{'text': 'Validation dataset helps to validate models and prediction on unseen data. This helps get an estimate on its performance on fresh data. It helps optimize the model.\\nEdidiong Esu\\nBelow is an extract of Alexey\\'s book explaining this point. Hope is useful\\nWhen we apply the fit method, this method is looking at the content of the df_train dictionaries we are passing to the DictVectorizer instance, and fit is figuring out (training) how to map the values of these dictionaries. If categorical, applies one-hot encoding, if numerical it will leave it as it is.\\nWith this context, if we apply the fit to the validation model, we are \"giving the answers\" and we are not letting the \"fit\" do its job for data that we haven\\'t seen. By not applying the fit to the validation model we can know how well it was trained.\\nBelow is an extract of Alexey\\'s book explaining this point.\\nHumberto Rodriguez\\nThere is no need to initialize another instance of dictvectorizer after fitting it on the train set as it will overwrite what it learnt from being fit on the train data.\\nThe correct way is to fit_transform the train set, and only transform the validation and test sets.\\nMemoona Tahira', 'section': '3. Machine Learning for Classification', 'question': 'Fitting DictVectorizer on validation', 'course': 'machine-learning-zoomcamp', 'id': 'b8da9037'}\n", "{'text': 'For Q5 in homework, should we calculate the smallest difference in accuracy in real values (i.e. -0.001 is less than -0.0002) or in absolute values (i.e. 0.0002 is less than 0.001)?\\nWe should select the “smallest” difference, and not the “lowest”, meaning we should reason in absolute values.\\nIf the difference is negative, it means that the model actually became better when we removed the feature.', 'section': '3. Machine Learning for Classification', 'question': 'Feature elimination', 'course': 'machine-learning-zoomcamp', 'id': '467e0cec'}\n", "{'text': \"Instead use the method “.get_feature_names_out()” from DictVectorizer function and the warning will be resolved , but we need not worry about the waning as there won't be any warning\\nSanthosh Kumar\", 'section': '3. Machine Learning for Classification', 'question': 'FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2', 'course': 'machine-learning-zoomcamp', 'id': 'b69f32f6'}\n", "{'text': 'Fitting the logistic regression takes a long time / kernel crashes when calling predict() with the fitted model.\\nMake sure that the target variable for the logistic regression is binary.\\nKonrad Muehlberg', 'section': '3. Machine Learning for Classification', 'question': 'Logistic regression crashing Jupyter kernel', 'course': 'machine-learning-zoomcamp', 'id': '3b3b1989'}\n", "{'text': 'Ridge regression is a linear regression technique used to mitigate the problem of multicollinearity (when independent variables are highly correlated) and prevent overfitting in predictive modeling. It adds a regularization term to the linear regression cost function, penalizing large coefficients.\\nsag Solver: The sag solver stands for \"Stochastic Average Gradient.\" It\\'s particularly suitable for large datasets, as it optimizes the regularization term using stochastic gradient descent (SGD). sag can be faster than some other solvers for large datasets.\\nAlpha: The alpha parameter controls the strength of the regularization in Ridge regression. A higher alpha value leads to stronger regularization, which means the model will have smaller coefficient values, reducing the risk of overfitting.\\nfrom sklearn.linear_model import Ridge\\nridge = Ridge(alpha=alpha, solver=\\'sag\\', random_state=42)\\nridge.fit(X_train, y_train)\\nAminat Abolade', 'section': '3. Machine Learning for Classification', 'question': 'Understanding Ridge', 'course': 'machine-learning-zoomcamp', 'id': 'eb5771a0'}\n", "{'text': 'DictVectorizer(sparse=True) produces CSR format, which is both more memory efficient and converges better during fit(). Basically it stores non-zero values and indices instead of adding a column for each class of each feature (models of cars produced 900+ columns alone in the current task).\\nUsing “sparse” format like on the picture above, both via pandas.get_dummies() and DictVectorizer(sparse=False) - is slower (around 6-8min for Q6 task - Linear/Ridge Regression) for high amount of classes (like models of cars for eg) and gives a bit “worse” results in both Logistic and Linear/Ridge Regression, while also producing convergence warnings for Linear/Ridge Regression.\\nLarkin Andrii', 'section': '3. Machine Learning for Classification', 'question': 'pandas.get_dummies() and DictVectorizer(sparse=False) produce the same type of one-hot encodings:', 'course': 'machine-learning-zoomcamp', 'id': 'bca10281'}\n", "{'text': 'Ridge with sag solver requires feature to be of the same scale. You may get the following warning: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\\nPlay with different scalers. See notebook-scaling-ohe.ipynb\\nDmytro Durach\\n(Oscar Garcia) Use a StandardScaler for the numeric fields and OneHotEncoder (sparce = False) for the categorical features. This help with the warning. Separate the features (num/cat) without using the encoder first and see if that helps.', 'section': '3. Machine Learning for Classification', 'question': 'Convergence Problems in W3Q6', 'course': 'machine-learning-zoomcamp', 'id': '34a8edb0'}\n", "{'text': \"When encountering convergence errors during the training of a Ridge regression model, consider the following steps:\\nFeature Normalization: Normalize your numerical features using techniques like MinMaxScaler or StandardScaler. This ensures that numerical features are on a \\tsimilar scale, preventing convergence issues.\\nCategorical Feature Encoding: If your dataset includes categorical features, apply \\tcategorical encoding techniques such as OneHotEncoder (OHE) to \\tconvert them into a numerical format. OHE is commonly used to represent categorical variables as binary vectors, making them compatible with regression models like Ridge.\\nCombine Features: After \\tnormalizing numerical features and encoding categorical features using OneHotEncoder, combine them to form a single feature matrix (X_train). This combined dataset serves as the input for training the Ridge regression model.\\nBy following these steps, you can address convergence errors and enhance the stability of your Ridge model training process. It's important to note that the choice of encoding method, such as OneHotEncoder, is appropriate for handling categorical features in this context.\\nYou can find an example here.\\n \\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tOsman Ali\", 'section': '3. Machine Learning for Classification', 'question': 'Dealing with Convergence in Week 3 q6', 'course': 'machine-learning-zoomcamp', 'id': 'f625307b'}\n", "{'text': 'A sparse matrix is more memory-efficient because it only stores the non-zero values and their positions in memory. This is particularly useful when working with large datasets with many zero or missing values.\\nThe default DictVectorizer configuration is a sparse matrix. For week3 Q6 using the default sparse is an interesting option because of the size of the matrix. Training the model was also more performant and didn’t give an error message like dense mode.\\n \\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tQuinn Avila', 'section': '3. Machine Learning for Classification', 'question': 'Sparse matrix compared dense matrix', 'course': 'machine-learning-zoomcamp', 'id': '7fa98526'}\n", "{'text': 'The warnings on the jupyter notebooks can be disabled/ avoided with the following comments:\\nImport warnings\\nwarnings.filterwarnings(“ignore”)\\nKrishna Anand', 'section': '3. Machine Learning for Classification', 'question': 'How to Disable/avoid Warnings in Jupyter Notebooks', 'course': 'machine-learning-zoomcamp', 'id': '0807f0f3'}\n", "{'text': 'Question: Regarding RMSE, how do we decide on the correct score to choose? In the study group discussion about week two homework, all of us got it wrong and one person had the lowest score selected as well.\\nAnswer: You need to find RMSE for each alpha. If RMSE scores are equal, you will select the lowest alpha.\\nAsia Saeed', 'section': '3. Machine Learning for Classification', 'question': 'How to select the alpha parameter in Q6', 'course': 'machine-learning-zoomcamp', 'id': '6d0fb418'}\n", "{'text': 'Question: Could you please help me with HW3 Q3: \"Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.\" What is the second variable that we need to use to calculate the mutual information score?\\nAnswer: You need to calculate the mutual info score between the binarized price (above_average) variable & ocean_proximity, the only original categorical variable in the dataset.\\nAsia Saeed', 'section': '3. Machine Learning for Classification', 'question': 'Second variable that we need to use to calculate the mutual information score', 'course': 'machine-learning-zoomcamp', 'id': 'fbda1f40'}\n", "{'text': 'Do we need to train the model only with the features: total_rooms, total_bedrooms, population and households? or with all the available features and then pop once at a time each of the previous features and train the model to make the accuracy comparison?\\nYou need to create a list of all features in this question and evaluate the model one time to obtain the accuracy, this will be the original accuracy, and then remove one feature each time, and in each time, train the model, find the accuracy and the difference between the original accuracy and the found accuracy. Finally, find out which feature has the smallest absolute accuracy difference.\\nWhile calculating differences between accuracy scores while training on the whole model, versus dropping one feature at a time and comparing its accuracy to the model to judge impact of the feature on the accuracy of the model, do we take the smallest difference or smallest absolute difference?\\nSince order of subtraction between the two accuracy scores can result in a negative number, we will take its absolute value as we are interested in the smallest value difference, not the lowest difference value. Case in point, if difference is -4 and -2, the smallest difference is abs(-2), and not abs(-4)', 'section': '3. Machine Learning for Classification', 'question': 'Features for homework Q5', 'course': 'machine-learning-zoomcamp', 'id': '0f88b7ac'}\n", "{'text': 'Both work in similar ways. That is, to convert categorical features to numerical variables for use in training the model. But the difference lies in the input. OneHotEncoder uses an array as input while DictVectorizer uses a dictionary.\\nBoth will produce the same result. But when we use OneHotEncoder, features are sorted alphabetically. When you use DictVectorizer you stack features that you want.\\nTanya Mard', 'section': '3. Machine Learning for Classification', 'question': 'What is the difference between OneHotEncoder and DictVectorizer?', 'course': 'machine-learning-zoomcamp', 'id': '9ffcc895'}\n", "{'text': 'They are basically the same. There are some key differences with regards to their input/output types, handling of missing values, etc, but they are both techniques to one-hot-encode categorical variables with identical results. The biggest difference is get_dummies are a convenient choice when you are working with Pandas Dataframes, while if you are building a scikit-learn-based machine learning pipeline and need to handle categorical data as part of that pipeline, OneHotEncoder is a more suitable choice. [Abhirup Ghosh]', 'section': '3. Machine Learning for Classification', 'question': 'What is the difference between pandas get_dummies and sklearn OnehotEncoder?', 'course': 'machine-learning-zoomcamp', 'id': '94a3b2fb'}\n", "{'text': \"For the test_train_split question on week 3's homework, are we supposed to use 42 as the random_state in both splits or only the 1st one?\\nAnswer: for both splits random_state = 42 should be used\\n(Bhaskar Sarma)\", 'section': '3. Machine Learning for Classification', 'question': 'Use of random seed in HW3', 'course': 'machine-learning-zoomcamp', 'id': 'fb9a45d8'}\n", "{'text': 'Should correlation be calculated after splitting or before splitting. And lastly I know how to find the correlation but how do i find the two most correlated features.\\nAnswer: Correlation matrix of your train dataset. Thus, after splitting. Two most correlated features are the ones having the highest correlation coefficient in terms of absolute values.', 'section': '3. Machine Learning for Classification', 'question': 'Correlation before or after splitting the data', 'course': 'machine-learning-zoomcamp', 'id': 'e31051f7'}\n", "{'text': 'Make sure that the features used in ridge regression model are only NUMERICAL ones not categorical.\\nDrop all categorical features first before proceeding.\\n(Aileah Gotladera)\\nWhile it is True that ridge regression accepts only numerical values, the categorical ones can be useful for your model. You have to transform them using one-hot encoding before training the model. To avoid the error of non convergence, put sparse=True when doing so.\\n(Erjon)', 'section': '3. Machine Learning for Classification', 'question': 'Features in Ridge Regression Model', 'course': 'machine-learning-zoomcamp', 'id': '493b7b59'}\n", "{'text': \"You need to use all features. and price for target. Don't include the average variable we created before.\\nIf you use DictVectorizer then make sure to use sparce=True to avoid convergence errors\\nI also used StandardScalar for numerical variable you can try running with or without this\\n(Peter Pan)\", 'section': '3. Machine Learning for Classification', 'question': 'Handling Column Information for Homework 3 Question 6', 'course': 'machine-learning-zoomcamp', 'id': '4a55c510'}\n", "{'text': 'Use sklearn.preprocessing encoders and scalers, e.g. OneHotEncoder, OrdinalEncoder, and StandardScaler.', 'section': '3. Machine Learning for Classification', 'question': 'Transforming Non-Numerical Columns into Numerical Columns', 'course': 'machine-learning-zoomcamp', 'id': '3ca0b489'}\n", "{'text': 'These both methods receive the dictionary as an input. While the DictVectorizer will store the big vocabulary and takes more memory. FeatureHasher create a vectors with predefined length. They are both used for categorical features.\\nWhen you have a high cardinality for categorical features better to use FeatureHasher. If you want to preserve feature names in transformed data and have a small number of unique values is DictVectorizer. But your choice will dependence on your data.\\nYou can read more by follow the link https://scikit-learn.org/stable/auto_examples/text/plot_hashing_vs_dict_vectorizer.html\\nOlga Rudakova', 'section': '3. Machine Learning for Classification', 'question': 'What is the better option FeatureHasher or DictVectorizer', 'course': 'machine-learning-zoomcamp', 'id': '690d97f1'}\n", "{'text': '(Question by Connie S.)\\nThe reason it\\'s good/recommended practice to do it after splitting is to avoid data leakage - you don\\'t want any data from the test set influencing the training stage (similarly from the validation stage in the initial training). See e.g. scikit-learn documentation on \"Common pitfalls and recommended practices\": https://scikit-learn.org/stable/common_pitfalls.html\\nAnswered/added by Rileen Sinha', 'section': '3. Machine Learning for Classification', 'question': \"Isn't it easier to use DictVertorizer or get dummies before splitting the data into train/val/test? Is there a reason we wouldn't do this? Or is it the same either way?\", 'course': 'machine-learning-zoomcamp', 'id': 'eb5a25cb'}\n", "{'text': 'If you are getting 1.0 as accuracy then there is a possibility you have overfitted the model. Dropping the column msrp/price can help you solve this issue.\\nAdded by Akshar Goyal', 'section': '3. Machine Learning for Classification', 'question': 'HW3Q4 I am getting 1.0 as accuracy. Should I use the closest option?', 'course': 'machine-learning-zoomcamp', 'id': '6d9e0a6f'}\n", "{'text': 'We can use sklearn & numpy packages to calculate Root Mean Squared Error\\nfrom sklearn.metrics import mean_squared_error\\nimport numpy as np\\nRmse = np.sqrt(mean_squared_error(y_pred, y_val/ytest)\\nAdded by Radikal Lukafiardi\\nYou can also refer to Alexey’s notebook for Week 2:\\nhttps://github.com/alexeygrigorev/mlbookcamp-code/blob/master/chapter-02-car-price/02-carprice.ipynb\\nwhich includes the following code:\\ndef rmse(y, y_pred):\\nerror = y_pred - y\\nmse = (error ** 2).mean()\\nreturn np.sqrt(mse)\\n(added by Rileen Sinha)', 'section': '3. Machine Learning for Classification', 'question': 'How to calculate Root Mean Squared Error?', 'course': 'machine-learning-zoomcamp', 'id': '618ad97a'}\n", "{'text': 'The solution is to use “get_feature_names_out” instead. See details: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html\\nGeorge Chizhmak', 'section': '3. Machine Learning for Classification', 'question': \"AttributeError: 'DictVectorizer' object has no attribute 'get_feature_names'\", 'course': 'machine-learning-zoomcamp', 'id': '683495d2'}\n", "{'text': 'To use RMSE without math or numpy, ‘sklearn.metrics’ has a mean_squared_error function with a squared kwarg (defaults to True). Setting squared to False will return the RMSE.\\nfrom sklearn.metrics import mean_squared_error\\nrms = mean_squared_error(y_actual, y_predicted, squared=False)\\nSee details: https://stackoverflow.com/questions/17197492/is-there-a-library-function-for-root-mean-square-error-rmse-in-python\\nAhmed Okka', 'section': '3. Machine Learning for Classification', 'question': 'Root Mean Squared Error', 'course': 'machine-learning-zoomcamp', 'id': 'dc1897b5'}\n", "{'text': 'This article explains different encoding techniques used https://towardsdatascience.com/all-about-categorical-variable-encoding-305f3361fd02\\nHrithik Kumar Advani', 'section': '3. Machine Learning for Classification', 'question': 'Encoding Techniques', 'course': 'machine-learning-zoomcamp', 'id': '826098f2'}\n", "{'text': \"I got this error multiple times here is the code:\\n“accuracy_score(y_val, y_pred >= 0.5)”\\nTypeError: 'numpy.float64' object is not callable\\nI solve it using\\nfrom sklearn import metrics\\nmetrics.accuracy_score(y_train, y_pred>= 0.5)\\nOMAR Wael\", 'section': '4. Evaluation Metrics for Classification', 'question': 'Error in use of accuracy_score from sklearn in jupyter (sometimes)', 'course': 'machine-learning-zoomcamp', 'id': '821dfc08'}\n", "{'text': 'Week 4 HW: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/04-evaluation/homework.md\\nAll HWs: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/\\nEvaluation Matrix: https://docs.google.com/spreadsheets/d/e/2PACX-1vQCwqAtkjl07MTW-SxWUK9GUvMQ3Pv_fF8UadcuIYLgHa0PlNu9BRWtfLgivI8xSCncQs82HDwGXSm3/pubhtml\\nGitHub for theory: https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/course-zoomcamp\\nYouTube Link: 4.X --- https://www.youtube.com/watch?v=gmg5jw1bM8A&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=40\\nSci-Kit Learn on Evaluation:\\nhttps://scikit-learn.org/stable/model_selection.html\\n~~Nukta Bhatia~~', 'section': '4. Evaluation Metrics for Classification', 'question': 'How do I get started with Week 4?', 'course': 'machine-learning-zoomcamp', 'id': '27c8d5da'}\n", "{'text': 'https://datatalks-club.slack.com/archives/C0288NJ5XSA/p1696475675887119\\nMetrics can be used on a series or a dataframe\\n~~Ella Sahnan~~', 'section': '4. Evaluation Metrics for Classification', 'question': 'Using a variable to score', 'course': 'machine-learning-zoomcamp', 'id': 'a52d4739'}\n", "{'text': 'Ie particularly in module-04 homework Qn2 vs Qn5. https://datatalks-club.slack.com/archives/C0288NJ5XSA/p1696760905214979\\nRefer to the sklearn docs, random_state is to ensure the “randomness” that is used to shuffle dataset is reproducible, and it usually requires both random_state and shuffle params to be set accordingly.\\n~~Ella Sahnan~~', 'section': '4. Evaluation Metrics for Classification', 'question': 'Why do we sometimes use random_state and not at other times?', 'course': 'machine-learning-zoomcamp', 'id': 'dc55359c'}\n", "{'text': 'How to get classification metrics - precision, recall, f1 score, accuracy simultaneously\\nUse classification_report from sklearn. For more info check here.\\nAbhishek N', 'section': '4. Evaluation Metrics for Classification', 'question': 'How to get all classification metrics?', 'course': 'machine-learning-zoomcamp', 'id': '2ab49e43'}\n", "{'text': 'I am getting multiple thresholds with the same F1 score, does this indicate I am doing something wrong or is there a method for choosing? I would assume just pick the lowest?\\nChoose the one closest to any of the options\\nAdded by Azeez Enitan Edunwale\\nYou can always use scikit-learn (or other standard libraries/packages) to verify results obtained using your own code, e.g. you can use “classification_report” (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) to obtain precision, recall and F1-score.\\nAdded by Rileen Sinha', 'section': '4. Evaluation Metrics for Classification', 'question': 'Multiple thresholds for Q4', 'course': 'machine-learning-zoomcamp', 'id': 'b431e7eb'}\n", "{'text': \"Solution description: duplicating the\\ndf.churn = (df.churn == 'yes').astype(int)\\nThis is causing you to have only 0's in your churn column. In fact, match with the error you are getting: ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.\\nIt is telling us that it only contains 0's.\\nDelete one of the below cells and you will get the accuracy\\nHumberto Rodriguez\", 'section': '4. Evaluation Metrics for Classification', 'question': 'ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0', 'course': 'machine-learning-zoomcamp', 'id': 'c5fdeba9'}\n", "{'text': 'Use Yellowbrick. Yellowbrick in a library that combines scikit-learn with matplotlib to produce visualizations for your models. It produces colorful classification reports.\\nKrishna Annad', 'section': '4. Evaluation Metrics for Classification', 'question': 'Method to get beautiful classification report', 'course': 'machine-learning-zoomcamp', 'id': 'b8c9eaf1'}\n", "{'text': 'That’s fine, use the closest option', 'section': '4. Evaluation Metrics for Classification', 'question': 'I’m not getting the exact result in homework', 'course': 'machine-learning-zoomcamp', 'id': 'c54058a1'}\n", "{'text': 'Check the solutions from the 2021 iteration of the course. You should use roc_auc_score.', 'section': '4. Evaluation Metrics for Classification', 'question': 'Use AUC to evaluate feature importance of numerical variables', 'course': 'machine-learning-zoomcamp', 'id': 'b4b85c4b'}\n", "{'text': 'When calculating the ROC AUC score using sklearn.metrics.roc_auc_score the function expects two parameters “y_true” and “y_score”. So for each numerical value in the dataframe it will be passed as the “y_score” to the function and the target variable will get passed a “y_true” each time.\\nSylvia Schmitt', 'section': '4. Evaluation Metrics for Classification', 'question': 'Help with understanding: “For each numerical value, use it as score and compute AUC”', 'course': 'machine-learning-zoomcamp', 'id': '7d40f6f6'}\n", "{'text': 'You must use the `dt_val` dataset to compute the metrics asked in Question 3 and onwards, as you did in Question 2.\\nDiego Giraldo', 'section': '4. Evaluation Metrics for Classification', 'question': 'What dataset should I use to compute the metrics in Question 3', 'course': 'machine-learning-zoomcamp', 'id': 'f5dc446c'}\n", "{'text': \"What does this line do?\\nKFold(n_splits=n_splits, shuffle=True, random_state=1)\\nIf I do it inside the loop [0.01, 0.1, 1, 10] or outside the loop in Q6, HW04 it doesn't make any difference to my answers. I am wondering why and what is the right way, although it doesn't make a difference!\\nDid you try using a different random_state? From my understanding, KFold just makes N (which is equal to n_splits) separate pairs of datasets (train+val).\\nhttps://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html\\nIn my case changing random state changed results\\n(Arthur Minakhmetov)\\nChanging the random state makes a difference in my case too, but not whether it is inside or outside the for loop. I think I have got the answer. kFold = KFold(n_splits=n_splits, shuffle = True, random_state = 1) is just a generator object and it contains only the information n_splits, shuffle and random_state. The k-fold splitting actually happens in the next for loop for train_idx, val_idx in kFold.split(df_full_train): . So it doesn't matter where we generate the object, before or after the first loop. It will generate the same information. But from the programming point of view, it is better to do it before the loop. No point doing it again and again inside the loop\\n(Bhaskar Sarma)\\nIn case of KFold(n_splits=n_splits, shuffle=True, random_state=1) and C= [0.01, 0.1, 1, 10], it is better to loop through the different values of Cs as the video explained. I had separate train() and predict() functions, which were reused after dividing the dataset via KFold. The model ran about 10 minutes and provided a good score.\\n(Ani Mkrtumyan)\", 'section': '4. Evaluation Metrics for Classification', 'question': 'What does KFold do?', 'course': 'machine-learning-zoomcamp', 'id': 'd30fc29d'}\n", "{'text': \"I’m getting “ValueError: multi_class must be in ('ovo', 'ovr')” when using roc_auc_score to evaluate feature importance of numerical variables in question 1.\\nI was getting this error because I was passing the parameters to roc_auc_score incorrectly (df_train[col] , y_train) . The correct way is to pass the parameters in this way: roc_auc_score(y_train, df_train[col])\\nAsia Saeed\", 'section': '4. Evaluation Metrics for Classification', 'question': \"ValueError: multi_class must be in ('ovo', 'ovr')\", 'course': 'machine-learning-zoomcamp', 'id': '8eca9f73'}\n", "{'text': 'from tqdm.auto import tqdm\\nTqdm - terminal progress bar\\nKrishna Anand', 'section': '4. Evaluation Metrics for Classification', 'question': 'Monitoring Wait times and progress of the code execution can be done with:', 'course': 'machine-learning-zoomcamp', 'id': '7b9eb7f7'}\n", "{'text': 'Inverting or negating variables with ROC AUC scores less than the threshold is a valuable technique to improve feature importance and model performance when dealing with negatively correlated features. It helps ensure that the direction of the correlation aligns with the expectations of most machine learning algorithms.\\nAileah Gotladera', 'section': '4. Evaluation Metrics for Classification', 'question': 'What is the use of inverting or negating the variables less than the threshold?', 'course': 'machine-learning-zoomcamp', 'id': 'c4aaeed9'}\n", "{'text': 'In case of using predict(X) for this task we are getting the binary classification predictions which are 0 and 1. This may lead to incorrect evaluation values.\\nThe solution is to use predict_proba(X)[:,1], where we get the probability that the value belongs to one of the classes.\\nVladimir Yesipov\\nPredict_proba shows probailites per class.\\nAni Mkrtumyan', 'section': '4. Evaluation Metrics for Classification', 'question': 'Difference between predict(X) and predict_proba(X)[:, 1]', 'course': 'machine-learning-zoomcamp', 'id': '3af31e2a'}\n", "{'text': 'For churn/not churn predictions, I need help to interpret the following scenario please, what is happening when:\\nThe threshold is 1.0\\nFPR is 0.0\\nAnd TPR is 0.0\\nWhen the threshold is 1.0, the condition for belonging to the positive class (churn class) is g(x)>=1.0 But g(x) is a sigmoid function for a binary classification problem. It has values between 0 and 1. This function never becomes equal to outermost values, i.e. 0 and 1.\\nThat is why there is no object, for which churn-condition could be satisfied. And that is why there is no any positive (churn) predicted value (neither true positive, nor false positive), if threshold is equal to 1.0\\nAlena Kniazeva', 'section': '4. Evaluation Metrics for Classification', 'question': 'Why are FPR and TPR equal to 0.0, when threshold = 1.0?', 'course': 'machine-learning-zoomcamp', 'id': '746342ff'}\n", "{'text': \"Matplotlib has a cool method to annotate where you could provide an X,Y point and annotate with an arrow and text. For example this will show an arrow pointing to the x,y point optimal threshold.\\nplt.annotate(f'Optimal Threshold: {optimal_threshold:.2f}\\\\nOptimal F1 Score: {optimal_f1_score:.2f}',\\nxy=(optimal_threshold, optimal_f1_score),\\nxytext=(0.3, 0.5),\\ntextcoords='axes fraction',\\narrowprops=dict(facecolor='black', shrink=0.05))\\nQuinn Avila\", 'section': '4. Evaluation Metrics for Classification', 'question': 'How can I annotate a graph?', 'course': 'machine-learning-zoomcamp', 'id': 'bda2c9b3'}\n", "{'text': \"It's a complex and abstract topic and it requires some time to understand. You can move on without fully understanding the concept.\\nNonetheless, it might be useful for you to rewatch the video, or even watch videos/lectures/notes by other people on this topic, as the ROC AUC is one of the most important metrics used in Binary Classification models.\", 'section': '4. Evaluation Metrics for Classification', 'question': 'I didn’t fully understand the ROC curve. Can I move on?', 'course': 'machine-learning-zoomcamp', 'id': '41521c92'}\n", "{'text': 'One main reason behind that, is the way of splitting data. For example, we want to split data into train/validation/test with the ratios 60%/20%/20% respectively.\\nAlthough the following two options end up with the same ratio, the data itself is a bit different and not 100% matching in each case.\\n1)\\ndf_train, df_temp = train_test_split(df, test_size=0.4, random_state=42)\\ndf_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)\\n2)\\ndf_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)\\ndf_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)\\nTherefore, I would recommend using the second method which is more consistent with the lessons and thus the homeworks.\\nIbraheem Taha', 'section': '4. Evaluation Metrics for Classification', 'question': 'Why do I have different values of accuracy than the options in the homework?', 'course': 'machine-learning-zoomcamp', 'id': '25481ce5'}\n", "{'text': 'You can find the intercept between these two curves using numpy diff (https://numpy.org/doc/stable/reference/generated/numpy.diff.html ) and sign (https://numpy.org/doc/stable/reference/generated/numpy.sign.html):\\nI suppose here that you have your df_scores ready with your three columns ‘threshold’, ‘precision’ and ‘recall’:\\nYou want to know at which index (or indices) you have your intercept between precision and recall (namely: where the sign of the difference between precision and recall changes):\\nidx = np.argwhere(\\nnp.diff(\\nnp.sign(np.array(df_scores[\"precision\"]) - np.array(df_scores[\"recall\"]))\\n)\\n).flatten()\\nYou can print the result to easily read it:\\nprint(\\nf\"The precision and recall curves intersect at a threshold equal to {df_scores.loc[idx][\\'threshold\\']}.\"\\n)\\n(Mélanie Fouesnard)', 'section': '4. Evaluation Metrics for Classification', 'question': 'How to find the intercept between precision and recall curves by using numpy?', 'course': 'machine-learning-zoomcamp', 'id': '1427d567'}\n", "{'text': \"In the demonstration video, we are shown how to calculate the precision and recall manually. You can use the Scikit Learn library to calculate the confusion matrix. precision, recall, f1_score without having to first define true positive, true negative, false positive, and false negative.\\nfrom sklearn.metrics import precision_score, recall_score, f1_score\\nprecision_score(y_true, y_pred, average='binary')\\nrecall_score(y_true, y_pred, average='binary')\\nf1_score(y_true, y_pred, average='binary')\\nRadikal Lukafiardi\", 'section': '4. Evaluation Metrics for Classification', 'question': 'Compute Recall, Precision, and F1 Score using scikit-learn library', 'course': 'machine-learning-zoomcamp', 'id': '76c91dfb'}\n", "{'text': 'Cross-validation evaluates the performance of a model and chooses the best hyperparameters. Cross-validation does this by splitting the dataset into multiple parts (folds), typically 5 or 10. It then trains and evaluates your model multiple times, each time using a different fold as the validation set and the remaining folds as the training set.\\n\"C\" is a hyperparameter that is typically associated with regularization in models like Support Vector Machines (SVM) and logistic regression.\\nSmaller \"C\" values: They introduce more regularization, which means the model will try to find a simpler decision boundary, potentially underfitting the data. This is because it penalizes the misclassification of training examples more severely.\\nLarger \"C\" values: They reduce the regularization effect, allowing the model to fit the training data more closely, potentially overfitting. This is because it penalizes misclassification less severely, allowing the model to prioritize getting training examples correct.\\nAminat Abolade', 'section': '4. Evaluation Metrics for Classification', 'question': 'Why do we use cross validation?', 'course': 'machine-learning-zoomcamp', 'id': 'e4dd91cf'}\n", "{'text': \"Model evaluation metrics can be easily computed using off the shelf calculations available in scikit learn library. This saves a lot of time and more precise compared to our own calculations from the scratch using numpy and pandas libraries.\\nfrom sklearn.metrics import (accuracy_score,\\nprecision_score,\\nrecall_score,\\nf1_score,\\nroc_auc_score\\n)\\naccuracy = accuracy_score(y_val, y_pred)\\nprecision = precision_score(y_val, y_pred)\\nrecall = recall_score(y_val, y_pred)\\nf1 = f1_score(y_val, y_pred)\\nroc_auc = roc_auc_score(y_val, y_pred)\\nprint(f'Accuracy: {accuracy}')\\nprint(f'Precision: {precision}')\\nprint(f'Recall: {recall}')\\nprint(f'F1-Score: {f1}')\\nprint(f'ROC AUC: {roc_auc}')\\n(Harish Balasundaram)\", 'section': '4. Evaluation Metrics for Classification', 'question': 'Evaluate the Model using scikit learn metrics', 'course': 'machine-learning-zoomcamp', 'id': 'cc53ae94'}\n", "{'text': 'Scikit-learn offers another way: precision_recall_fscore_support\\nExample:\\nfrom sklearn.metrics import precision_recall_fscore_support\\nprecision, recall, fscore, support = precision_recall_fscore_support(y_val, y_val_pred, zero_division=0)\\n(Gopakumar Gopinathan)', 'section': '4. Evaluation Metrics for Classification', 'question': 'Are there other ways to compute Precision, Recall and F1 score?', 'course': 'machine-learning-zoomcamp', 'id': '403bbdd8'}\n", "{'text': '- ROC curves are appropriate when the observations are balanced between each class, whereas precision-recall curves are appropriate for imbalanced datasets.\\n- The reason for this recommendation is that ROC curves present an optimistic picture of the model on datasets with a class imbalance.\\n-This is because of the use of true negatives in the False Positive Rate in the ROC Curve and the careful avoidance of this rate in the Precision-Recall curve.\\n- If the proportion of positive to negative instances changes in a test set, the ROC curves will not change. Metrics such as accuracy, precision, lift and F scores use values from both columns of the confusion matrix. As a class distribution changes these measures will change as well, even if the fundamental classifier performance does not. ROC graphs are based upon TP rate and FP rate, in which each dimension is a strict columnar ratio, so cannot give an accurate picture of performance when there is class imbalance.\\n(Anudeep Vanjavakam)', 'section': '4. Evaluation Metrics for Classification', 'question': 'When do I use ROC vs Precision-Recall curves?', 'course': 'machine-learning-zoomcamp', 'id': '7c68ace0'}\n", "{'text': 'You can use roc_auc_score function from sklearn.metrics module and pass the vector of the target variable (‘above_average’) as the first argument and the vector of feature values as the second one. This function will return AUC score for the feature that was passed as a second argument.\\n(Denys Soloviov)', 'section': '4. Evaluation Metrics for Classification', 'question': 'How to evaluate feature importance for numerical variables with AUC?', 'course': 'machine-learning-zoomcamp', 'id': '147577f5'}\n", "{'text': 'Precision-recall curve, and thus the score, explicitly depends on the ratio of positive to negative test cases. This means that comparison of the F-score across different problems with differing class ratios is problematic. One way to address this issue is to use a standard class ratio when making such comparisons.\\n(George Chizhmak)', 'section': '4. Evaluation Metrics for Classification', 'question': 'Dependence of the F-score on class imbalance', 'course': 'machine-learning-zoomcamp', 'id': 'd3ffb802'}\n", "{'text': \"We can import precision_recall_curve from scikit-learn and plot the graph as follows:\\nfrom sklearn.metrics import precision_recall_curve\\nprecision, recall, thresholds = precision_recall_curve(y_val, y_predict)\\nplt.plot(thresholds, precision[:-1], label='Precision')\\nplt.plot(thresholds, recall[:-1], label='Recall')\\nplt.legend()\\nHrithik Kumar Advani\", 'section': '4. Evaluation Metrics for Classification', 'question': 'Quick way to plot Precision-Recall Curve', 'course': 'machine-learning-zoomcamp', 'id': 'cc04d27a'}\n", "{'text': 'For multiclass classification it is important to keep class balance when you split the data set. In this case Stratified k-fold returns folds that contains approximately the sme percentage of samples of each classes.\\nPlease check the realisation in sk-learn library:\\nhttps://scikit-learn.org/stable/modules/cross_validation.html#stratified-k-fold\\nOlga Rudakova', 'section': '5. Deploying Machine Learning Models', 'question': 'What is Stratified k-fold?', 'course': 'machine-learning-zoomcamp', 'id': '927b5e09'}\n", "{'text': 'Week 5 HW: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/05-deployment/homework.md\\nAll HWs: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/\\nHW 3 Solution: https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/course-zoomcamp/cohorts/2022/03-classification/homework_3.ipynb\\nEvaluation Matrix: https://docs.google.com/spreadsheets/d/e/2PACX-1vQCwqAtkjl07MTW-SxWUK9GUvMQ3Pv_fF8UadcuIYLgHa0PlNu9BRWtfLgivI8xSCncQs82HDwGXSm3/pubhtml\\nGitHub for theory: https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/course-zoomcamp\\nYouTube Link: 5.X --- https://www.youtube.com/watch?v=agIFak9A3m8&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=49\\n~~~ Nukta Bhatia ~~~', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I get started with Week 5?', 'course': 'machine-learning-zoomcamp', 'id': 'd22efea7'}\n", "{'text': 'While weeks 1-4 can relatively easily be followed and the associated homework completed with just about any default environment / local setup, week 5 introduces several layers of abstraction and dependencies.\\nIt is advised to prepare your “homework environment” with a cloud provider of your choice. A thorough step-by-step guide for doing so for an AWS EC2 instance is provided in an introductory video taken from the MLOPS course here:\\nhttps://www.youtube.com/watch?v=IXSiYkP23zo\\nNote that (only) small AWS instances can be run for free, and that larger ones will be billed hourly based on usage (but can and should be stopped when not in use).\\nAlternative ways are sketched here:\\nhttps://github.com/alexeygrigorev/mlbookcamp-code/blob/master/course-zoomcamp/01-intro/06-environment.md', 'section': '5. Deploying Machine Learning Models', 'question': 'Errors related to the default environment: WSL, Ubuntu, proper Python version, installing pipenv etc.', 'course': 'machine-learning-zoomcamp', 'id': 'd1409f67'}\n", "{'text': \"You’ll need a kaggle account\\nGo to settings, API and click `Create New Token`. This will download a `kaggle.json` file which contains your `username` and `key` information\\nIn the same location as your Jupyter NB, place the `kaggle.json` file\\nRun `!chmod 600 /kaggle.json`\\nMake sure to import os via `import os` and then run:\\nos.environ['KAGGLE_CONFIG_DIR'] = \\nFinally you can run directly in your NB: `!kaggle datasets download -d kapturovalexander/bank-credit-scoring`\\nAnd then you can unzip the file and access the CSV via: `!unzip -o bank-credit-scoring.zip`\\n>>> Michael Fronda <<<\", 'section': '5. Deploying Machine Learning Models', 'question': 'How to download CSV data via Jupyter NB and the Kaggle API, for one seamless experience', 'course': 'machine-learning-zoomcamp', 'id': 'e07759e9'}\n", "{'text': 'Cd .. (go back)\\nLs (see current folders)\\nCd ‘path’/ (go to this path)\\nPwd (home)\\nCat “file name’ --edit txt file in ubuntu\\nAileah Gotladera', 'section': '5. Deploying Machine Learning Models', 'question': 'Basic Ubuntu Commands:', 'course': 'machine-learning-zoomcamp', 'id': '620fb76e'}\n", "{'text': 'Open terminal and type the code below to check the version on your laptop\\npython3 --version\\nFor windows,\\nVisit the official python website at https://www.python.org/downloads/ to download the python version you need for installation\\nRun the installer and ensure to check the box that says “Add Python to PATH” during installation and complete the installation by following the prompts\\nOr\\nFor Python 3,\\nOpen your command prompt or terminal and run the following command:\\npip install --upgrade python\\nAminat Abolade', 'section': '5. Deploying Machine Learning Models', 'question': 'Installing and updating to the python version 3.10 and higher', 'course': 'machine-learning-zoomcamp', 'id': '957280d8'}\n", "{'text': 'It is quite simple, and you can follow these instructions here:\\nhttps://www.youtube.com/watch?v=qYlgUDKKK5A&ab_channel=NeuralNine\\nMake sure that you have “Virtual Machine Platform” feature activated in your Windows “Features”. To do that, search “features” in the research bar and see if the checkbox is selected. You also need to make sure that your system (in the bios) is able to virtualize. This is usually the case.\\nIn the Microsoft Store: look for ‘Ubuntu’ or ‘Debian’ (or any linux distribution you want) and install it\\nOnce it is downloaded, open the app and choose a username and a password (secured one). When you type your password, nothing will show in the window, which is normal: the writing is invisible.\\nYou are now inside of your linux system. You can test some commands such as “pwd”. You are not in your Windows system.\\nTo go to your windows system: you need to go back two times with cd ../.. And then go to the “mnt” directory with cd mnt. If you list here your files, you will see your disks. You can move to the desired folder, for example here I moved to the ML_Zoomcamp folder:\\nPython should be already installed but you can check it by running sudo apt install python3 command.\\nYou can make your actual folder your default folder when you open your Ubuntu terminal with this command : echo \"cd ../../mnt/your/folder/path\" >> ~/.bashrc\\nYou can disable bell sounds (when you type something that does not exist for example) by modifying the inputrc file with this command: sudo vim /etc/inputrc\\nYou have to uncomment the set bell-style none line -> to do that, press the “i” keyboard letter (for insert) and go with your keyboard to this line. Delete the # and then press the Escape keyboard touch and finally press “:wq” to write (it saves your modifications) then quit.\\nYou can check that your modifications are taken into account by opening a new terminal (you can pin it to your task bar so you do not have to go to the Microsoft app each time).\\nYou will need to install pip by running this command sudo apt install python3-pip\\nNB: I had this error message when trying to install pipenv (https://github.com/microsoft/WSL/issues/5663):\\n/sbin/ldconfig.real: Can\\'t link /usr/lib/wsl/lib/libnvoptix_loader.so.1 to libnvoptix.so.1\\n/sbin/ldconfig.real: /usr/lib/wsl/lib/libcuda.so.1 is not a symbolic link\\nSo I had to create the following symbolic link:\\nsudo ln -s /usr/lib/wsl/lib/libcuda.so.1 /usr/lib64/libcuda.so\\n(Mélanie Fouesnard)', 'section': '5. Deploying Machine Learning Models', 'question': 'How to install WSL on Windows 10 and 11 ?', 'course': 'machine-learning-zoomcamp', 'id': '185096ad'}\n", "{'text': \"Do you get errors building the Docker image on the Mac M1 chipset?\\nThe error I was getting was:\\nCould not open '/lib64/ld-linux-x86-64.so.2': No such file or directory\\nThe fix (from here): vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\\nOpen mlbookcamp-code/course-zoomcamp/01-intro/environment/Dockerfile\\nReplace line 1 with\\nFROM --platform=linux/amd64 ubuntu:latest\\nNow build the image as specified. In the end it took over 2 hours to build the image but it did complete in the end.\\nDavid Colton\", 'section': '5. Deploying Machine Learning Models', 'question': 'Error building Docker images on Mac with M1 silicon', 'course': 'machine-learning-zoomcamp', 'id': 'ec88d101'}\n", "{'text': 'Import waitress\\nprint(waitress.__version__)\\nKrishna Anand', 'section': '5. Deploying Machine Learning Models', 'question': 'Method to find the version of any install python libraries in jupyter notebook', 'course': 'machine-learning-zoomcamp', 'id': '7156679d'}\n", "{'text': 'Working on getting Docker installed - when I try running hello-world I am getting the error.\\nDocker: Cannot connect to the docker daemon at unix:///var/run/docker.sock. Is the Docker daemon running ?\\nSolution description\\nIf you’re getting this error on WSL, re-install your docker: remove the docker installation from WSL and install Docker Desktop on your host machine (Windows).\\nOn Linux, start the docker daemon with either of these commands:\\nsudo dockerd\\nsudo service docker start\\nAdded by Ugochukwu Onyebuchi', 'section': '5. Deploying Machine Learning Models', 'question': 'Cannot connect to the docker daemon. Is the Docker daemon running?', 'course': 'machine-learning-zoomcamp', 'id': '4b2a3181'}\n", "{'text': 'After using the command “docker build -t churn-prediction .” to build the Docker image, the above error is raised and the image is not created.\\nIn your Dockerfile, change the Python version in the first line the Python version installed in your system:\\nFROM python:3.7.5-slim\\nTo find your python version, use the command python --version. For example:\\npython --version\\n>> Python 3.9.7\\nThen, change it on your Dockerfile:\\nFROM python:3.9.7-slim\\nAdded by Filipe Melo', 'section': '5. Deploying Machine Learning Models', 'question': \"The command '/bin/sh -c pipenv install --deploy --system && rm -rf /root/.cache' returned a non-zero code: 1\", 'course': 'machine-learning-zoomcamp', 'id': '73bd7fa1'}\n", "{'text': 'When the facilitator was adding sklearn to the virtual environment in the lectures, he used sklearn==0.24.1 and it ran smoothly. But while doing the homework and you are asked to use the 1.0.2 version of sklearn, it gives errors.\\nThe solution is to use the full name of sklearn. That is, run it as “pipenv install scikit-learn==1.0.2” and the error will go away, allowing you to install sklearn for the version in your virtual environment.\\nOdimegwu David\\nHomework asks you to install 1.3.1\\nPipenv install scikit-learn==1.3.1\\nUse Pipenv to install Scikit-Learn version 1.3.1\\nGopakumar Gopinathan', 'section': '5. Deploying Machine Learning Models', 'question': 'Running “pipenv install sklearn==1.0.2” gives errors. What should I do?', 'course': 'machine-learning-zoomcamp', 'id': 'a4d3b1e5'}\n", "{'text': 'What is the reason we don’t want to keep the docker image in our system and why do we need to run docker containers with `--rm` flag?\\nFor best practice, you don’t want to have a lot of abandoned docker images in your system. You just update it in your folder and trigger the build one more time.\\nThey consume extra space on your disk. Unless you don’t want to re-run the previously existing containers, it is better to use the `--rm` option.\\nThe right way to say: “Why do we remove the docker container in our system?”. Well the docker image is still kept; it is the container that is not kept. Upon execution, images are not modified; only containers are.\\nThe option `--rm` is for removing containers. The images remain until you remove them manually. If you don’t specify a version when building an image, it will always rebuild and replace the latest tag. `docker images` shows you all the image you have pulled or build so far.\\nDuring development and testing you usually specify `--rm` to get the containers auto removed upon exit. Otherwise they get accumulated in a stopped state, taking up space. `docker ps -a` shows you all the containers you have in your host. Each time you change Pipfile (or any file you baked into the container), you rebuild the image under the same tag or a new tag. It’s important to understand the difference between the term “docker image” and “docker container”. Image is what we build with all the resources baked in. You can move it around, maintain it in a repository, share it. Then we use the image to spin up instances of it and they are called containers.\\nAdded by Muhammad Awon', 'section': '5. Deploying Machine Learning Models', 'question': 'Why do we need the --rm flag', 'course': 'machine-learning-zoomcamp', 'id': '1d462fe0'}\n", "{'text': 'When you create the dockerfile the name should be dockerfile and needs to be without extension. One of the problems we can get at this point is to create the dockerfile as a dockerfile extension Dockerfile.dockerfile which creates an error when we build the docker image. Instead we just need to create the file without extension: Dockerfile and will run perfectly.\\nAdded by Pastor Soto', 'section': '5. Deploying Machine Learning Models', 'question': 'Failed to read Dockerfile', 'course': 'machine-learning-zoomcamp', 'id': '366d7563'}\n", "{'text': 'Refer to the page https://docs.docker.com/desktop/install/mac-install/ remember to check if you have apple chip or intel chip.', 'section': '5. Deploying Machine Learning Models', 'question': 'Install docker on MacOS', 'course': 'machine-learning-zoomcamp', 'id': 'cef156d1'}\n", "{'text': 'Problem: When I am trying to pull the image with the docker pull svizor/zoomcamp-model command I am getting an error:\\nUsing default tag: latest\\nError response from daemon: manifest for svizor/zoomcamp-model:latest not found: manifest unknown: manifest unknown\\nSolution: The docker by default uses the latest tag to avoid this use the correct tag from image description. In our case use command:\\ndocker pull svizor/zoomcamp-model:3.10.12-slim\\nAdded by Vladimir Yesipov', 'section': '5. Deploying Machine Learning Models', 'question': 'I cannot pull the image with docker pull command', 'course': 'machine-learning-zoomcamp', 'id': 'b632d2ea'}\n", "{'text': 'Using the command docker images or docker image ls will dump all information for all local Docker images. It is possible to dump the information only for a specified image by using:\\ndocker image ls \\nOr alternatively:\\ndocker images \\nIn action to that it is possible to only dump specific information provided using the option --format which will dump only the size for the specified image name when using the command below:\\ndocker image ls --format \"{{.Size}}\" \\nOr alternatively:\\ndocker images --format \"{{.Size}}\" \\nSylvia Schmitt', 'section': '5. Deploying Machine Learning Models', 'question': 'Dumping/Retrieving only the size of for a specific Docker image', 'course': 'machine-learning-zoomcamp', 'id': '514e27bb'}\n", "{'text': \"It creates them in\\nOSX/Linux: ~/.local/share/virtualenvs/folder-name_cyrptic-hash\\nWindows: C:\\\\Users\\\\\\\\.virtualenvs\\\\folder-name_cyrptic-hash\\nEg: C:\\\\Users\\\\Ella\\\\.virtualenvs\\\\code-qsdUdabf (for module-05 lesson)\\nThe environment name is the name of the last folder in the folder directory where we used the pipenv install command (or any other pipenv command). E.g. If you run any pipenv command in folder path ~/home/user/Churn-Flask-app, it will create an environment named Churn-Flask-app-some_random_characters, and it's path will be like this: /home/user/.local/share/virtualenvs/churn-flask-app-i_mzGMjX.\\nAll libraries of this environment will be installed inside this folder. To activate this environment, I will need to cd into the project folder again, and type pipenv shell. In short, the location of the project folder acts as an identifier for an environment, in place of any name.\\n(Memoona Tahira)\", 'section': '5. Deploying Machine Learning Models', 'question': 'Where does pipenv create environments and how does it name them?', 'course': 'machine-learning-zoomcamp', 'id': '5c67e086'}\n", "{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\\ndocker run -it --entrypoint bash \\nIf the container is already running, execute a command in the specific container:\\ndocker ps (find the container-id)\\ndocker exec -it bash\\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp', 'id': '63a81b57'}\n", "{'text': \"$ docker exec -it 1e5a1b663052 bash\\nthe input device is not a TTY. If you are using mintty, try prefixing the command with 'winpty'\\nFix:\\nwinpty docker exec -it 1e5a1b663052 bash\\nA TTY is a terminal interface that supports escape sequences, moving the cursor around, etc.\\nWinpty is a Windows software package providing an interface similar to a Unix pty-master for communicating with Windows console programs.\\nMore info on terminal, shell, console applications hi and so on:\\nhttps://conemu.github.io/en/TerminalVsShell.html\\n(Marcos MJD)\", 'section': '5. Deploying Machine Learning Models', 'question': 'The input device is not a TTY when running docker in interactive mode (Running Docker on Windows in GitBash)', 'course': 'machine-learning-zoomcamp', 'id': '047f57fb'}\n", "{'text': 'Initially, I did not assume there was a model2. I copied the original model1.bin and dv.bin. Then when I tried to load using\\nCOPY [\"model2.bin\", \"dv.bin\", \"./\"]\\nthen I got the error above in MINGW64 (git bash) on Windows.\\nThe temporary solution I found was to use\\nCOPY [\"*\", \"./\"]\\nwhich I assume combines all the files from the original docker image and the files in your working directory.\\nAdded by Muhammed Tan', 'section': '5. Deploying Machine Learning Models', 'question': 'Error: failed to compute cache key: \"/model2.bin\" not found: not found', 'course': 'machine-learning-zoomcamp', 'id': '11f7371c'}\n", "{'text': 'Create a virtual environment using the Cmd command (command) and use pip freeze command to write the requirements in the text file\\nKrishna Anand', 'section': '5. Deploying Machine Learning Models', 'question': 'Failed to write the dependencies to pipfile and piplock file', 'course': 'machine-learning-zoomcamp', 'id': '45f39b76'}\n", "{'text': 'f-String not properly keyed in: does anyone knows why i am getting error after import pickle?\\nThe first error showed up because your f-string is using () instead of {} around C. So, should be: f’model_C={C}.bin’\\nThe second error as noticed by Sriniketh, your are missing one parenthesis it should be pickle.dump((dv, model), f_out)\\n(Humberto R.)', 'section': '5. Deploying Machine Learning Models', 'question': 'f-strings', 'course': 'machine-learning-zoomcamp', 'id': '94e17563'}\n", "{'text': \"This error happens because pipenv is already installed but you can't access it from the path.\\nThis error comes out if you run.\\npipenv --version\\npipenv shell\\nSolution for Windows\\nOpen this option\\nClick here\\nClick in Edit Button\\nMake sure the next two locations are on the PATH, otherwise, add it.\\nC:\\\\Users\\\\AppData\\\\....\\\\Python\\\\PythonXX\\\\\\nC:\\\\Users\\\\AppData\\\\....\\\\Python\\\\PythonXX\\\\Scripts\\\\\\nAdded by Alejandro Aponte\\nNote: this answer assumes you don’t use Anaconda. For Windows, using Anaconda would be a better choice and less prone to errors.\", 'section': '5. Deploying Machine Learning Models', 'question': \"'pipenv' is not recognized as an internal or external command, operable program or batch file.\", 'course': 'machine-learning-zoomcamp', 'id': '9dd8efd2'}\n", "{'text': 'Following the instruction from video week-5.6, using pipenv to install python libraries throws below error\\nSolution to this error is to make sure that you are working with python==3.9 (as informed in the very first lesson of the zoomcamp) and not python==3.10.\\nAdded by Hareesh Tummala', 'section': '5. Deploying Machine Learning Models', 'question': 'AttributeError: module ‘collections’ has no attribute ‘MutableMapping’', 'course': 'machine-learning-zoomcamp', 'id': '9531dc92'}\n", "{'text': 'After entering `pipenv shell` don’t forget to use `exit` before `pipenv --rm`, as it may cause errors when trying to install packages, it is unclear whether you are “in the shell”(using Windows) at the moment as there are no clear markers for it.\\nIt can also mess up PATH, if that’s the case, here’s terminal commands for fixing that:\\n# for Windows\\nset VIRTUAL_ENV \"\"\\n# for Unix\\nexport VIRTUAL_ENV=\"\"\\nAlso manually re-creating removed folder at `C:\\\\Users\\\\username\\\\.virtualenvs\\\\removed-envname` can help, removed-envname can be seen at the error message.\\nAdded by Andrii Larkin', 'section': '5. Deploying Machine Learning Models', 'question': \"Q: ValueError: Path not found or generated: WindowsPath('C:/Users/username/.virtualenvs/envname/Scripts')\", 'course': 'machine-learning-zoomcamp', 'id': '14e0e697'}\n", "{'text': 'Set the host to ‘0.0.0.0’ on the flask app and dockerfile then RUN the url using localhost.\\n(Theresa S.)', 'section': '5. Deploying Machine Learning Models', 'question': \"ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))\", 'course': 'machine-learning-zoomcamp', 'id': '6189375f'}\n", "{'text': 'Solution:\\nThis error occurred because I used single quotes around the filenames. Stick to double quotes', 'section': '5. Deploying Machine Learning Models', 'question': 'docker build ERROR [x/y] COPY …', 'course': 'machine-learning-zoomcamp', 'id': '3419ee27'}\n", "{'text': 'I tried the first solution on Stackoverflow which recommended running `pipenv lock` to update the Pipfile.lock. However, this didn’t resolve it. But the following switch to the pipenv installation worked\\nRUN pipenv install --system --deploy --ignore-pipfile', 'section': '5. Deploying Machine Learning Models', 'question': 'Fix error during installation of Pipfile inside Docker container', 'course': 'machine-learning-zoomcamp', 'id': '8b8c1603'}\n", "{'text': 'Solution\\nThis error was because there was another instance of gunicorn running. So I thought of removing this along with the zoomcamp_test image. However, it didn’t let me remove the orphan container. So I did the following\\nRunning the following commands\\ndocker ps -a \\ndocker images \\ndocker stop \\ndocker rm \\ndocker rmi image\\nI rebuilt the Docker image, and ran it once again; this time it worked correctly and I was able to serve the test script to the endpoint.', 'section': '5. Deploying Machine Learning Models', 'question': 'How to fix error after running the Docker run command', 'course': 'machine-learning-zoomcamp', 'id': 'e54d5411'}\n", "{'text': 'I was getting the below error when I rebuilt the docker image although the port was not allocated, and it was working fine.\\nError message:\\nError response from daemon: driver failed programming external connectivity on endpoint beautiful_tharp (875be95c7027cebb853a62fc4463d46e23df99e0175be73641269c3d180f7796): Bind for 0.0.0.0:9696 failed: port is already allocated.\\nSolution description\\nIssue has been resolved running the following command:\\ndocker kill $(docker ps -q)\\nhttps://github.com/docker/for-win/issues/2722\\nAsia Saeed', 'section': '5. Deploying Machine Learning Models', 'question': 'Bind for 0.0.0.0:9696 failed: port is already allocated', 'course': 'machine-learning-zoomcamp', 'id': 'f7b38587'}\n", "{'text': 'I was getting the error on client side with this\\nClient Side:\\nFile \"C:\\\\python\\\\lib\\\\site-packages\\\\urllib3\\\\connectionpool.py\", line 703, in urlopen …………………..\\nraise ConnectionError(err, request=request)\\nrequests.exceptions.ConnectionError: (\\'Connection aborted.\\', RemoteDisconnected(\\'Remote end closed connection without response\\'))\\nSevrer Side:\\nIt showed error for gunicorn\\nThe waitress cmd was running smoothly from server side\\nSolution:\\nUse the ip-address as 0.0.0.0:8000 or 0.0.0.0:9696.They are the ones which do work max times\\nAamir Wani', 'section': '5. Deploying Machine Learning Models', 'question': 'Bind for 127.0.0.1:5000 showing error', 'course': 'machine-learning-zoomcamp', 'id': 'be86b333'}\n", "{'text': 'Install it by using command\\n% brew install md5sha1sum\\nThen run command to check hash for file to check if they the same with the provided\\n% md5sum model1.bin dv.bin\\nOlga Rudakova', 'section': '5. Deploying Machine Learning Models', 'question': 'Installing md5sum on Macos', 'course': 'machine-learning-zoomcamp', 'id': '4ea80460'}\n", "{'text': 'Problem description:\\nI started a web-server in terminal (command window, powershell, etc.). How can I run another python script, which makes a request to this server?\\nSolution description:\\nJust open another terminal (command window, powershell, etc.) and run a python script.\\nAlena Kniazeva', 'section': '5. Deploying Machine Learning Models', 'question': 'How to run a script while a web-server is working?', 'course': 'machine-learning-zoomcamp', 'id': '8006b496'}\n", "{'text': \"Problem description:\\nIn video 5.5 when I do pipenv shell and then pipenv run gunicorn --bind 0.0.0.0:9696 predict:app, I get the following warning:\\nUserWarning: Trying to unpickle estimator DictVectorizer from version 1.1.1 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\\nSolution description:\\nWhen you create a virtual env, you should use the same version of Scikit-Learn that you used for training the model on this case it's 1.1.1. There is version conflicts so we need to make sure our model and dv files are created from the version we are using for the project.\\nBhaskar Sarma\", 'section': '5. Deploying Machine Learning Models', 'question': 'Version-conflict in pipenv', 'course': 'machine-learning-zoomcamp', 'id': '704f95d8'}\n", "{'text': \"If you install packages via pipenv install, and get an error that ends like this:\\npipenv.vendor.plette.models.base.ValidationError: {'python_version': '3.9', 'python_full_version': '3.9.13'}\\npython_full_version: 'python_version' must not be present with 'python_full_version'\\npython_version: 'python_full_version' must not be present with 'python_version'\\nDo this:\\nopen Pipfile in nano editor, and remove either the python_version or python_full_version line, press CTRL+X, type Y and click Enter to save changed\\nType pipenv lock to create the Pipfile.lock.\\nDone. Continue what you were doing\", 'section': '5. Deploying Machine Learning Models', 'question': 'Python_version and Python_full_version error after running pipenv install:', 'course': 'machine-learning-zoomcamp', 'id': 'a5b3296b'}\n", "{'text': 'If during running the docker build command, you get an error like this:\\nYour Pipfile.lock (221d14) is out of date. Expected: (939fe0).\\nUsage: pipenv install [OPTIONS] [PACKAGES]...\\nERROR:: Aborting deploy\\nOption 1: Delete the pipfile.lock via rm Pipfile, and then rebuild the lock via pipenv lock from the terminal before retrying the docker build command.\\nOption 2: If it still doesn’t work, remove the pipenv environment, Pipfile and Pipfile.lock, and create a new one before building docker again. Commands to remove pipenv environment and removing pipfiles:\\npipenv --rm\\nrm Pipfile*', 'section': '5. Deploying Machine Learning Models', 'question': 'Your Pipfile.lock (221d14) is out of date (during Docker build)', 'course': 'machine-learning-zoomcamp', 'id': 'a23b276a'}\n", "{'text': 'Ans: Pip uninstall waitress mflow. Then reinstall just mlflow. By this time you should have successfully built your docker image so you dont need to reinstall waitress. All good. Happy learning.\\nAdded by 🅱🅻🅰🆀', 'section': '5. Deploying Machine Learning Models', 'question': 'You are using windows. Conda environment. You then use waitress instead of gunicorn. After a few runs, suddenly mlflow server fails to run.', 'course': 'machine-learning-zoomcamp', 'id': '3537eeee'}\n", "{'text': \"Ans: so you have created the env. You need to make sure you're in eu-west-1 (ireland) when you check the EB environments. Maybe you're in a different region in your console.\\nAdded by Edidiong Esu\", 'section': '5. Deploying Machine Learning Models', 'question': 'Completed creating the environment locally but could not find the environment on AWS.', 'course': 'machine-learning-zoomcamp', 'id': '1d6d5b51'}\n", "{'text': 'Running \\'pip install waitress\\' as a command on GitBash was not downloading the executable file \\'waitress-serve.exe\\'. You need this file to be able to run commands with waitress in Git Bash. To solve this:\\nopen a Jupyter notebook and run the same command \\' pip install waitress\\'. This way the executable file will be downloaded. The notebook may give you this warning : \\'WARNING: The script waitress-serve.exe is installed in \\'c:\\\\Users\\\\....\\\\anaconda3\\\\Scripts\\' which is not on PATH. Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\\'\\nAdd the path where \\'waitress-serve.exe\\' is installed into gitbash\\'s PATH as such:\\nenter the following command in gitbash: nano ~/.bashrc\\nadd the path to \\'waitress-serve.exe\\' to PATH using this command: export PATH=\"/path/to/waitress:$PATH\"\\nclose gitbash and open it again and you should be good to go\\nAdded by Bachar Kabalan', 'section': '5. Deploying Machine Learning Models', 'question': 'Installing waitress on Windows via GitBash: “waitress-serve” command not found', 'course': 'machine-learning-zoomcamp', 'id': '3a98b6b7'}\n", "{'text': 'Q2.1: Use Pipenv to install Scikit-Learn version 1.3.1\\nThis is an error I got while executing the above step in the ml-zoomcamp conda environment. The error is not fatal and just warns you that explicit language specifications are not set out in our bash profile. A quick-fix is here:\\nhttps://stackoverflow.com/questions/49436922/getting-error-while-trying-to-run-this-command-pipenv-install-requests-in-ma\\nBut one can proceed without addressing it.\\nAdded by Abhirup Ghosh', 'section': '5. Deploying Machine Learning Models', 'question': 'Warning: the environment variable LANG is not set!', 'course': 'machine-learning-zoomcamp', 'id': 'd42eb923'}\n", "{'text': 'The provided image FROM svizor/zoomcamp-model:3.10.12-slim has a model and dictvectorizer that should be used for question 6. \"model2.bin\", \"dv.bin\"\\nAdded by Quinn Avila', 'section': '5. Deploying Machine Learning Models', 'question': 'Module5 HW Question 6', 'course': 'machine-learning-zoomcamp', 'id': '42aebe10'}\n", "{'text': 'https://apps.microsoft.com/detail/windows-terminal/9N0DX20HK701?hl=es-419&gl=CO\\nAdded by Dawuta Smit', 'section': '5. Deploying Machine Learning Models', 'question': 'Terminal Used in Week 5 videos:', 'course': 'machine-learning-zoomcamp', 'id': 'e4f62713'}\n", "{'text': \"Question:\\nWhen running\\npipenv run waitress-serve --listen=localhost:9696 q4-predict:app\\nI get the following:\\nThere was an exception (ValueError) importing your module.\\nIt had these arguments:\\n1. Malformed application 'q4-predict:app'\\nAnswer:\\nWaitress doesn’t accept a dash in the python file name.\\nThe solution is to rename the file replacing a dash with something else for instance with an underscore eg q4_predict.py\\nAdded by Alex Litvinov\", 'section': '5. Deploying Machine Learning Models', 'question': 'waitress-serve shows Malformed application', 'course': 'machine-learning-zoomcamp', 'id': 'c13d811f'}\n", "{'text': 'I wanted to have a fast and simple way to check if the HTTP POST requests are working just running a request from command line. This can be done running ‘curl’. \\n(Used with WSL2 on Windows, should also work on Linux and MacOS)\\ncurl --json \\'\\' \\n# piping the structure to the command\\ncat | curl --json @- \\necho \\'\\' | curl --json @- \\n# example using piping\\necho \\'{\"job\": \"retired\", \"duration\": 445, \"poutcome\": \"success\"}\\'\\\\\\n| curl --json @- http://localhost:9696/predict\\nAdded by Sylvia Schmitt', 'section': '5. Deploying Machine Learning Models', 'question': 'Testing HTTP POST requests from command line using curl', 'course': 'machine-learning-zoomcamp', 'id': 'dfb41f7e'}\n", "{'text': 'Question:\\nWhen executing\\neb local run --port 9696\\nI get the following error:\\nERROR: NotSupportedError - You can use \"eb local\" only with preconfigured, generic and multicontainer Docker platforms.\\nAnswer:\\nThere are two options to fix this:\\nRe-initialize by running eb init -i and choosing the options from a list (the first default option for docker platform should be fine).\\nEdit the ‘.elasticbeanstalk/config.yml’ directly changing the default_platform from Docker to default_platform: Docker running on 64bit Amazon Linux 2023\\nThe disadvantage of the second approach is that the option might not be available the following years\\nAdded by Alex Litvinov', 'section': '5. Deploying Machine Learning Models', 'question': 'NotSupportedError - You can use \"eb local\" only with preconfigured, generic and multicontainer Docker platforms.', 'course': 'machine-learning-zoomcamp', 'id': 'd04e77f8'}\n", "{'text': \"You need to include the protocol scheme: 'http://localhost:9696/predict'.\\nWithout the http:// part, requests has no idea how to connect to the remote server.\\nNote that the protocol scheme must be all lowercase; if your URL starts with HTTP:// for example, it won’t find the http:// connection adapter either.\\nAdded by George Chizhmak\", 'section': '5. Deploying Machine Learning Models', 'question': \"Requests Error: No connection adapters were found for 'localhost:9696/predict'.\", 'course': 'machine-learning-zoomcamp', 'id': '451c067f'}\n", "{'text': 'While running the docker image if you get the same result check which model you are using.\\nRemember you are using a model downloading model + python version so remember to change the model in your file when running your prediction test.\\nAdded by Ahmed Okka', 'section': '5. Deploying Machine Learning Models', 'question': 'Getting the same result', 'course': 'machine-learning-zoomcamp', 'id': '9fbfcd61'}\n", "{'text': 'Ensure that you used pipenv to install the necessary modules including gunicorn. As pipfiles for virtual environments, you can use pipenv shell and then build+run your docker image. - Akshar Goyal', 'section': '5. Deploying Machine Learning Models', 'question': 'Trying to run a docker image I built but it says it’s unable to start the container process', 'course': 'machine-learning-zoomcamp', 'id': '1ed8cfde'}\n", "{'text': \"You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\\nHrithik Kumar Advani\", 'section': '5. Deploying Machine Learning Models', 'question': 'How do I copy files from my local machine to docker container?', 'course': 'machine-learning-zoomcamp', 'id': '3f97f50f'}\n", "{'text': 'You can copy files from your local machine into a Docker container using the docker cp command. Here\\'s how to do it:\\nIn the Dockerfile, you can provide the folder containing the files that you want to copy over. The basic syntax is as follows:\\nCOPY [\"src/predict.py\", \"models/xgb_model.bin\", \"./\"]\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tGopakumar Gopinathan', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I copy files from a different folder into docker container’s working directory?', 'course': 'machine-learning-zoomcamp', 'id': 'a24a874a'}\n", "{'text': 'I struggled with the command :\\neb init -p docker tumor-diagnosis-serving -r eu-west-1\\nWhich resulted in an error when running : eb local run --port 9696\\nERROR: NotSupportedError - You can use \"eb local\" only with preconfigured, generic and multicontainer Docker platforms.\\nI replaced it with :\\neb init -p \"Docker running on 64bit Amazon Linux 2\" tumor-diagnosis-serving -r eu-west-1\\nThis allowed the recognition of the Dockerfile and the build/run of the docker container.\\nAdded by Mélanie Fouesnard', 'section': '5. Deploying Machine Learning Models', 'question': 'I can’t create the environment on AWS Elastic Beanstalk with the command proposed during the video', 'course': 'machine-learning-zoomcamp', 'id': 'bf563b1f'}\n", "{'text': \"I had this error when creating a AWS ElasticBean environment: eb create tumor-diagnosis-env\\nERROR Instance deployment: Both 'Dockerfile' and 'Dockerrun.aws.json' are missing in your source bundle. Include at least one of them. The deployment failed.\\nI did not committed the files used to build the container, particularly the Dockerfile. After a git add and git commit of the modified files, the command works.\\nAdded by Mélanie Fouesnard\", 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Dockerfile missing when creating the AWS ElasticBean environment', 'course': 'machine-learning-zoomcamp', 'id': '21e9facf'}\n", "{'text': 'Week 6 HW: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/06-trees/homework.md\\nAll HWs: https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2023/\\nHW 4 Solution: https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/course-zoomcamp/cohorts/2022/04-evaluation/homework_4.ipynb\\nEvaluation Matrix: https://docs.google.com/spreadsheets/d/e/2PACX-1vQCwqAtkjl07MTW-SxWUK9GUvMQ3Pv_fF8UadcuIYLgHa0PlNu9BRWtfLgivI8xSCncQs82HDwGXSm3/pubhtml\\nGitHub for theory: https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/course-zoomcamp\\nYouTube Link: 6.X --- https://www.youtube.com/watch?v=GJGmlfZoCoU&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=57\\nFAQs: https://docs.google.com/document/d/1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8/edit#heading=h.lpz96zg7l47j\\n~~~Nukta Bhatia~~~', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'How to get started with Week 6?', 'course': 'machine-learning-zoomcamp', 'id': 'aef786aa'}\n", "{'text': 'During the XGBoost lesson, we created a parser to extract the training and validation auc from the standard output. However, we can accomplish that in a more straightforward way.\\nWe can use the evals_result parameters, which takes an empty dictionary and updates it for each tree. Additionally, you can store the data in a dataframe and plot it in an easier manner.\\nAdded by Daniel Coronel', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'How to get the training and validation metrics from XGBoost?', 'course': 'machine-learning-zoomcamp', 'id': '68858294'}\n", "{'text': 'You should create sklearn.ensemble.RandomForestRegressor object. It’s rather similar to sklearn.ensemble.RandomForestClassificator for classification problems. Check https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html for more information.\\nAlena Kniazeva', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'How to solve regression problems with random forest in scikit-learn?', 'course': 'machine-learning-zoomcamp', 'id': '85ac722e'}\n", "{'text': 'In question 6, I was getting ValueError: feature_names must be string, and may not contain [, ] or < when I was creating DMatrix for train and validation\\nSolution description\\nThe cause of this error is that some of the features names contain special characters like = and <, and I fixed the error by removing them as follows:\\nfeatures= [i.replace(\"=<\", \"_\").replace(\"=\",\"_\") for i in features]\\nAsia Saeed\\nAlternative Solution:\\nIn my case the equal sign “=” was not a problem, so in my opinion the first part of Asias solution features= [i.replace(\"=<\", \"_\") should work as well.\\nFor me this works:\\nfeatures = []\\nfor f in dv.feature_names_:\\nstring = f.replace(“=<”, “-le”)\\nfeatures.append(string)\\nPeter Ernicke', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'ValueError: feature_names must be string, and may not contain [, ] or <', 'course': 'machine-learning-zoomcamp', 'id': 'b61d2e92'}\n", "{'text': 'If you’re getting this error, It is likely because the feature names in dv.get_feature_names_out() are a np.ndarray instead of a list so you have to convert them into a list by using the to_list() method.\\nAli Osman', 'section': '6. Decision Trees and Ensemble Learning', 'question': \"`TypeError: Expecting a sequence of strings for feature names, got: ` when training xgboost model.\", 'course': 'machine-learning-zoomcamp', 'id': '8d7392cb'}\n", "{'text': \"If you’re getting TypeError:\\n“TypeError: Expecting a sequence of strings for feature names, got: ”,\\nprobably you’ve done this:\\nfeatures = dv.get_feature_names_out()\\nIt gets you np.ndarray instead of list. Converting to list list(features) will not fix this, read below.\\nIf you’re getting ValueError:\\n“ValueError: feature_names must be string, and may not contain [, ] or <”,\\nprobably you’ve either done:\\nfeatures = list(dv.get_feature_names_out())\\nor:\\nfeatures = dv.feature_names_\\nreason is what you get from DictVectorizer here looks like this:\\n['households',\\n'housing_median_age',\\n'latitude',\\n'longitude',\\n'median_income',\\n'ocean_proximity=<1H OCEAN',\\n'ocean_proximity=INLAND',\\n'population',\\n'total_bedrooms',\\n'total_rooms']\\nit has symbols XGBoost doesn’t like ([, ] or <).\\nWhat you can do, is either do not specify “feature_names=” while creating xgb.DMatrix or:\\nimport re\\nfeatures = dv.feature_names_\\npattern = r'[\\\\[\\\\]<>]'\\nfeatures = [re.sub(pattern, ' ', f) for f in features]\\nAdded by Andrii Larkin\", 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Q6: ValueError or TypeError while setting xgb.DMatrix(feature_names=)', 'course': 'machine-learning-zoomcamp', 'id': 'c920eef3'}\n", "{'text': 'To install Xgboost, use the code below directly in your jupyter notebook:\\n(Pip 21.3+ is required)\\npip install xgboost\\nYou can update your pip by using the code below:\\npip install --upgrade pip\\nFor more about xgbboost and installation, check here:\\nhttps://xgboost.readthedocs.io/en/stable/install.html\\nAminat Abolade', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'How to Install Xgboost', 'course': 'machine-learning-zoomcamp', 'id': '5017c9a4'}\n", "{'text': 'Sometimes someone might wonder what eta means in the tunable hyperparameters of XGBoost and how it helps the model.\\nETA is the learning rate of the model. XGBoost uses gradient descent to calculate and update the model. In gradient descent, we are looking for the minimum weights that help the model to learn the data very well. This minimum weights for the features is updated each time the model passes through the features and learns the features during training. Tuning the learning rate helps you tell the model what speed it would use in deriving the minimum for the weights.', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'What is eta in XGBoost', 'course': 'machine-learning-zoomcamp', 'id': '6ffe101d'}\n", "{'text': 'For ensemble algorithms, during the week 6, one bagging algorithm and one boosting algorithm were presented: Random Forest and XGBoost, respectively.\\nRandom Forest trains several models in parallel. The output can be, for example, the average value of all the outputs of each model. This is called bagging.\\nXGBoost trains several models sequentially: the previous model error is used to train the following model. Weights are used to ponderate the models such as the best models have higher weights and are therefore favored for the final output. This method is called boosting.\\nNote that boosting is not necessarily better than bagging.\\nMélanie Fouesnard\\nBagging stands for “Bootstrap Aggregation” - it involves taking multiple samples with replacement to derive multiple training datasets from the original training dataset (bootstrapping), training a classifier (e.g. decision trees or stumps for Random Forests) on each such training dataset, and then combining the the predictions (aggregation) to obtain the final prediction. For classification, predictions are combined via voting; for regression, via averaging. Bagging can be done in parallel, since the various classifiers are independent. Bagging decreases variance (but not bias) and is robust against overfitting.\\nBoosting, on the other hand, is sequential - each model learns from the mistakes of its predecessor. Observations are given different weights - observations/samples misclassified by the previous classifier are given a higher weight, and this process is continued until a stopping condition is reached (e.g. max. No. of models is reached, or error is acceptably small, etc.). Boosting reduces bias & is generally more accurate than bagging, but can be prone to overfitting.\\nRileen', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'What is the difference between bagging and boosting?', 'course': 'machine-learning-zoomcamp', 'id': 'a55b29ff'}\n", "{'text': 'I wanted to directly capture the output from the xgboost training for multiple eta values to a dictionary without the need to run the same cell multiple times and manually editing the eta value in between or copy the code for a second eta value.\\nUsing the magic cell command “%%capture output” I was only able to capture the complete output for all iterations for the loop, but. I was able to solve this using the following approach. This is just a code sample to grasp the idea.\\n# This would be the content of the Jupyter Notebook cell\\nfrom IPython.utils.capture import capture_output\\nimport sys\\ndifferent_outputs = {}\\nfor i in range(3):\\nwith capture_output(sys.stdout) as output:\\nprint(i)\\nprint(\"testing capture\")\\ndifferent_outputs[i] = output.stdout\\n# different_outputs\\n# {0: \\'0\\\\ntesting capture\\\\n\\',\\n# 1: \\'1\\\\ntesting capture\\\\n\\',\\n# 2: \\'2\\\\ntesting capture\\\\n\\'}\\nAdded by Sylvia Schmitt', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Capture stdout for each iterations of a loop separately', 'course': 'machine-learning-zoomcamp', 'id': 'eac70ce3'}\n", "{'text': 'Calling roc_auc_score() to get auc is throwing the above error.\\nSolution to this issue is to make sure that you pass y_actuals as 1st argument and y_pred as 2nd argument.\\nroc_auc_score(y_train, y_pred)\\nHareesh Tummala', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'ValueError: continuous format is not supported', 'course': 'machine-learning-zoomcamp', 'id': '5f91f8ca'}\n", "{'text': 'When rmse stops improving means, when it stops to decrease or remains almost similar.\\nPastor Soto', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Question 3 of homework 6 if i see that rmse goes up at a certain number of n_estimators but then goes back down lower than it was before, should the answer be the number of n_estimators after which rmse initially went up, or the number after which it was its overall lowest value?', 'course': 'machine-learning-zoomcamp', 'id': 'a3be507a'}\n", "{'text': 'dot_data = tree.export_graphviz(regr, out_file=None,\\nfeature_names=boston.feature_names,\\nfilled=True)\\ngraphviz.Source(dot_data, format=\"png\")\\nKrishna Anand\\nfrom sklearn import tree\\ntree.plot_tree(dt,feature_names=dv.feature_names_)\\nAdded By Ryan Pramana', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'One of the method to visualize the decision trees', 'course': 'machine-learning-zoomcamp', 'id': '9a8faa50'}\n", "{'text': 'Solution: This problem happens because you use DecisionTreeClassifier instead of DecisionTreeRegressor. You should check if you want to use a Decision tree for classification or regression.\\nAlejandro Aponte', 'section': '6. Decision Trees and Ensemble Learning', 'question': \"ValueError: Unknown label type: 'continuous'\", 'course': 'machine-learning-zoomcamp', 'id': 'a6e384fe'}\n", "{'text': 'When I run dt = DecisionTreeClassifier() in jupyter in same laptop, each time I re-run it or do (restart kernel + run) I get different values of auc. Some of them are 0.674, 0.652, 0.642, 0.669 and so on. Anyone knows why it could be? I am referring to 7:40-7:45 of video 6.3.\\nSolution: try setting the random seed e.g\\ndt = DecisionTreeClassifier(random_state=22)\\nBhaskar Sarma', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Different values of auc, each time code is re-run', 'course': 'machine-learning-zoomcamp', 'id': 'ddc14ada'}\n", "{'text': \"They both do the same, it's just less typing from the script.\\nAsked by Andrew Katoch, Added by Edidiong Esu\", 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?', 'course': 'machine-learning-zoomcamp', 'id': '593f7569'}\n", "{'text': 'When I tried to run example from the video using function ping and can not import it. I use import ping and it was unsuccessful. To fix it I use the statement:\\n\\nfrom [file name] import ping\\nOlga Rudakova', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'No module named ‘ping’?', 'course': 'machine-learning-zoomcamp', 'id': '6cb56405'}\n", "{'text': 'The DictVectorizer has a function to get the feature names get_feature_names_out(). This is helpful for example if you need to analyze feature importance but use the dict vectorizer for one hot encoding. Just keep in mind it does return a numpy array so you may need to convert this to a list depending on your usage for example dv.get_feature_names_out() will return a ndarray array of string objects. list(dv.get_feature_names_out()) will convert to a standard list of strings. Also keep in mind that you first need to fit the predictor and response arrays before you have access to the feature names.\\nQuinn Avila', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'DictVectorizer feature names', 'course': 'machine-learning-zoomcamp', 'id': 'a22a93f1'}\n", "{'text': \"They both do the same, it's just less typing from the script.\", 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?', 'course': 'machine-learning-zoomcamp', 'id': '593f7569'}\n", "{'text': 'This error occurs because the list of feature names contains some characters like \"<\" that are not supported. To fix this issue, you can replace those problematic characters with supported ones. If you want to create a consistent list of features with no special characters, you can achieve it like this:\\nYou can address this error by replacing problematic characters in the feature names with underscores, like so:\\nfeatures = [f.replace(\\'=<\\', \\'_\\').replace(\\'=\\', \\'_\\') for f in features]\\nThis code will go through the list of features and replace any instances of \"=<\" with \"\", as well as any \"=\" with \"\", ensuring that the feature names only consist of supported characters.', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'ValueError: feature_names must be string, and may not contain [, ] or <', 'course': 'machine-learning-zoomcamp', 'id': 'b6259dea'}\n", "{'text': \"To make it easier for us to determine which features are important, we can use a horizontal bar chart to illustrate feature importance sorted by value.\\n1. # extract the feature importances from the model\\nfeature_importances = list(zip(features_names, rdr_model.feature_importances_))\\nimportance_df = pd.DataFrame(feature_importances, columns=['feature_names', 'feature_importances'])\\n2. # sort descending the dataframe by using feature_importances value\\nimportance_df = importance_df.sort_values(by='feature_importances', ascending=False)\\n3. # create a horizontal bar chart\\nplt.figure(figsize=(8, 6))\\nsns.barplot(x='feature_importances', y='feature_names', data=importance_df, palette='Blues_r')\\nplt.xlabel('Feature Importance')\\nplt.ylabel('Feature Names')\\nplt.title('Feature Importance Chart')\\nRadikal Lukafiardi\", 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Visualize Feature Importance by using horizontal bar chart', 'course': 'machine-learning-zoomcamp', 'id': 'bcfdc6f4'}\n", "{'text': 'Instead of using np.sqrt() as the second step. You can extract it using like this way :\\nmean_squared_error(y_val, y_predict_val,squared=False)\\nAhmed Okka', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'RMSE using metrics.root_meas_square()', 'course': 'machine-learning-zoomcamp', 'id': 'a7e7cdd2'}\n", "{'text': 'I like this visual implementation of features importance in scikit-learn library:\\nhttps://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html\\nIt actually adds std.errors to features importance -> so that you can trace stability of features (important for a model’s explainability) over the different params of the model.\\nIvan Brigida', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Features Importance graph', 'course': 'machine-learning-zoomcamp', 'id': '55477da8'}\n", "{'text': 'Expanded error says: xgboost.core.XGBoostError: sklearn needs to be installed in order to use this module. So, sklearn in requirements solved the problem.\\nGeorge Chizhmak', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'xgboost.core.XGBoostError: This app has encountered an error. The original error message is redacted to prevent data leaks.', 'course': 'machine-learning-zoomcamp', 'id': '6a245a05'}\n", "{'text': 'Information gain in Y due to X, or the mutual information of Y and X\\nWhere is the entropy of Y. \\n\\nIf X is completely uninformative about Y:\\nIf X is completely informative about Y: )\\nHrithik Kumar Advani', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Information Gain', 'course': 'machine-learning-zoomcamp', 'id': '4405bfca'}\n", "{'text': 'Filling in missing values using an entire dataset before splitting for training/testing/validation causes', 'section': '6. Decision Trees and Ensemble Learning', 'question': 'Data Leakage', 'course': 'machine-learning-zoomcamp', 'id': '3e0acc25'}\n", "{'text': 'Save model by calling ‘booster.save_model’, see eg\\nLoad model:\\nDawuta Smit\\nThis section is moved to Projects', 'section': '8. Neural Networks and Deep Learning', 'question': 'Serialized Model Xgboost error', 'course': 'machine-learning-zoomcamp', 'id': 'abaecdf8'}\n", "{'text': 'TODO', 'section': '8. Neural Networks and Deep Learning', 'question': 'How to get started with Week 8?', 'course': 'machine-learning-zoomcamp', 'id': 'ff40f83b'}\n", "{'text': 'Create or import your notebook into Kaggle.\\nClick on the Three dots at the top right hand side\\nClick on Accelerator\\nChoose T4 GPU\\nKhurram Majeed', 'section': '8. Neural Networks and Deep Learning', 'question': 'How to use Kaggle for Deep Learning?', 'course': 'machine-learning-zoomcamp', 'id': '95a16746'}\n", "{'text': 'Create or import your notebook into Google Colab.\\nClick on the Drop Down at the top right hand side\\nClick on “Change runtime type”\\nChoose T4 GPU\\nKhurram Majeed', 'section': '8. Neural Networks and Deep Learning', 'question': 'How to use Google Colab for Deep Learning?', 'course': 'machine-learning-zoomcamp', 'id': '46acdd18'}\n", "{'text': 'Connecting your GPU on Saturn Cloud to Github repository is not compulsory, since you can just download the notebook and copy it to the Github folder. But if you like technology to do things for you, then follow the solution description below:\\nSolution description: Follow the instructions in these github docs to create an SSH private and public key:\\nhttps://docs.github.com/en/authentication/connecting-to-github-with-ssh/generating-a-new-ssh-ke\\ny-and-adding-it-to-the-ssh-agenthttps://docs.github.com/en/authentication/connecting-to-github-with-ssh/adding-a-new-ssh-key-to-your-github-account?tool=webui\\nThen the second video on this module about saturn cloud would show you how to add the ssh keys to secrets and authenticate through a terminal.\\nOr alternatively, you could just use the public keys provided by Saturn Cloud by default. To do so, follow these steps:\\nClick on your username and on manage\\nDown below you will see the Git SSH keys section.\\nCopy the default public key provided by Saturn Cloud\\nPaste these key into the SSH keys section of your github repo\\nOpen a terminal on Saturn Cloud and run this command “ssh -T git@github.com”\\nYou will receive a successful authentication notice.\\nOdimegwu David', 'section': '8. Neural Networks and Deep Learning', 'question': 'How do I push from Saturn Cloud to Github?', 'course': 'machine-learning-zoomcamp', 'id': 'f721d54b'}\n", "{'text': 'This template is referred to in the video 8.1b Setting up the Environment on Saturn Cloud\\nbut the location shown in the video is no longer correct.\\nThis template has been moved to “python deep learning tutorials’ which is shown on the Saturn Cloud home page.\\nSteven Christolis', 'section': '8. Neural Networks and Deep Learning', 'question': 'Where is the Python TensorFlow template on Saturn Cloud?', 'course': 'machine-learning-zoomcamp', 'id': '69cd4897'}\n", "{'text': 'The above error happens since module scipy is not installed in the saturn cloud tensorflow image. While creating the Jupyter server resource, in the “Extra Packages” section under pip in the textbox write scipy. Below the textbox, the pip install scipy command will be displayed. This will ensure when the resource spins up, the scipy package will be automatically installed. This approach can also be followed for additional python packages.\\nSumeet Lalla', 'section': '8. Neural Networks and Deep Learning', 'question': 'Getting error module scipy not found during model training in Saturn Cloud tensorflow image', 'course': 'machine-learning-zoomcamp', 'id': '346e799a'}\n", "{'text': 'Problem description: Uploading the data to saturn cloud from kaggle can be time saving, specially if the dataset is large.\\nYou can just download to your local machine and then upload to a folder on saturn cloud, but there is a better solution that needs to be set once and you have access to all kaggle datasets in saturn cloud.\\nOn your notebook run:\\n!pip install -q kaggle\\nGo to Kaggle website (you need to have an account for this):\\nClick on your profile image -> Account\\nScroll down to the API box\\nClick on Create New API token\\nIt will download a json file with the name kaggle.json store on your local computer. We need to upload this file in the .kaggle folder\\nOn the notebook click on folder icon on the left upper corner\\nThis will take you to the root folder\\nClick on the .kaggle folder\\nOnce inside of the .kaggle folder upload the kaggle.json file that you downloaded\\nRun this command on your notebook:\\n!chmod 600 /home/jovyan/.kaggle/kaggle.json\\nDownload the data using this command:\\n!kaggle datasets download -d agrigorev/dino-or-dragon\\nCreate a folder to unzip your files:\\n!mkdir data\\nUnzip your files inside that folder\\n!unzip dino-or-dragon.zip -d data\\nPastor Soto', 'section': '8. Neural Networks and Deep Learning', 'question': 'How to upload kaggle data to Saturn Cloud?', 'course': 'machine-learning-zoomcamp', 'id': '551461b2'}\n", "{'text': 'In order to run tensorflow with gpu on your local machine you’ll need to setup cuda and cudnn.\\nThe process can be overwhelming. Here’s a simplified guide\\nOsman Ali', 'section': '8. Neural Networks and Deep Learning', 'question': 'How to install CUDA & cuDNN on Ubuntu 22.04', 'course': 'machine-learning-zoomcamp', 'id': 'c3ba4459'}\n", "{'text': 'Problem description:\\nWhen loading saved model getting error: ValueError: Unable to load weights saved in HDF5 format into a subclassed Model which has not created its variables yet. Call the Model first, then load the weights.\\nSolution description:\\nBefore loading model need to evaluate the model on input data: model.evaluate(train_ds)\\nAdded by Vladimir Yesipov', 'section': '8. Neural Networks and Deep Learning', 'question': 'Error: (ValueError: Unable to load weights saved in HDF5 format into a subclassed Model which has not created its variables yet. Call the Model first, then load the weights.) when loading model.', 'course': 'machine-learning-zoomcamp', 'id': 'a114ad55'}\n", "{'text': 'Problem description:\\nWhen follow module 8.1b video to setup git in Saturn Cloud, run `ssh -T git@github.com` lead error `git@github.com: Permission denied (publickey).`\\nSolution description:\\nAlternative way, we can setup git in our Saturn Cloud env with generate SSH key in our Saturn Cloud and add it to our git account host. After it, we can access/manage our git through Saturn’s jupyter server. All steps detailed on this following tutorial: https://saturncloud.io/docs/using-saturn-cloud/gitrepo/\\nAdded by Ryan Pramana', 'section': '8. Neural Networks and Deep Learning', 'question': 'Getting error when connect git on Saturn Cloud: permission denied', 'course': 'machine-learning-zoomcamp', 'id': 'dd3c8000'}\n", "{'text': \"Problem description:\\nGetting an error using \\nThe error:\\nCloning into 'clothing-dataset'...\\nHost key verification failed.\\nfatal: Could not read from remote repository.\\nPlease make sure you have the correct access rights\\nand the repository exists.\\nSolution description:\\nwhen cloning the repo, you can also chose https - then it should work. This happens when you don't have your ssh key configured.\\n\\nAdded by Gregory Morris\", 'section': '8. Neural Networks and Deep Learning', 'question': 'Host key verification failed.', 'course': 'machine-learning-zoomcamp', 'id': '34b0ebfc'}\n", "{'text': \"Problem description\\nThe accuracy and the loss are both still the same or nearly the same while training.\\nSolution description\\nIn the homework, you should set class_mode='binary' while reading the data.\\nAlso, problem occurs when you choose the wrong optimizer, batch size, or learning rate\\nAdded by Ekaterina Kutovaia\", 'section': '8. Neural Networks and Deep Learning', 'question': 'The same accuracy on epochs', 'course': 'machine-learning-zoomcamp', 'id': '7d11d5ce'}\n", "{'text': 'Problem:\\nWhen resuming training after augmentation, the loss skyrockets (1000+ during first epoch) and accuracy settles around 0.5 – i.e. the model becomes as good as a random coin flip.\\nSolution:\\nCheck that the augmented ImageDataGenerator still includes the option “rescale” as specified in the preceding step.\\nAdded by Konrad Mühlberg', 'section': '8. Neural Networks and Deep Learning', 'question': 'Model breaking after augmentation – high loss + bad accuracy', 'course': 'machine-learning-zoomcamp', 'id': 'e4e45f15'}\n", "{'text': \"While doing:\\nimport tensorflow as tf\\nfrom tensorflow import keras\\nmodel = tf.keras.models.load_model('model_saved.h5')\\nIf you get an error message like this:\\nValueError: The channel dimension of the inputs should be defined. The input_shape received is (None, None, None, None), where axis -1 (0-based) is the channel dimension, which found to be `None`.\\nSolution:\\nSaving a model (either yourself via model.save() or via checkpoint when save_weights_only = False) saves two things: The trained model weights (for example the best weights found during training) and the model architecture. If the number of channels is not explicitly specified in the Input layer of the model, and is instead defined as a variable, the model architecture will not have the value in the variable stored. Therefore when the model is reloaded, it will complain about not knowing the number of channels. See the code below, in the first line, you need to specify number of channels explicitly:\\n# model architecture:\\ninputs = keras.Input(shape=(input_size, input_size, 3))\\nbase = base_model(inputs, training=False)\\nvectors = keras.layers.GlobalAveragePooling2D()(base)\\ninner = keras.layers.Dense(size_inner, activation='relu')(vectors)\\ndrop = keras.layers.Dropout(droprate)(inner)\\noutputs = keras.layers.Dense(10)(drop)\\nmodel = keras.Model(inputs, outputs)\\n(Memoona Tahira)\", 'section': '8. Neural Networks and Deep Learning', 'question': 'Missing channel value error while reloading model:', 'course': 'machine-learning-zoomcamp', 'id': 'b3997e6f'}\n", "{'text': \"Problem:\\nA dataset for homework is in a zipped folder. If you unzip it within a jupyter notebook by means of ! unzip command, you’ll see a huge amount of output messages about unzipping of each image. So you need to suppress this output\\nSolution:\\nExecute the next cell:\\n%%capture\\n! unzip zipped_folder_name.zip -d destination_folder_name\\nAdded by Alena Kniazeva\\nInside a Jupyter Notebook:\\nimport zipfile\\nlocal_zip = 'data.zip'\\nzip_ref = zipfile.ZipFile(local_zip, 'r')\\nzip_ref.extractall('data')\\nzip_ref.close()\", 'section': '8. Neural Networks and Deep Learning', 'question': 'How to unzip a folder with an image dataset and suppress output?', 'course': 'machine-learning-zoomcamp', 'id': 'e414df91'}\n", "{'text': 'Problem:\\nWhen we run train_gen.flow_from_directory() as in video 8.5, it finds images belonging to 10 classes. Does it understand the names of classes from the names of folders? Or, there is already something going on deep behind?\\nSolution:\\nThe name of class is the folder name\\nIf you just create some random folder with the name \"xyz\", it will also be considered as a class!! The name itself is saying flow_from_directory\\na clear explanation below:\\nhttps://vijayabhaskar96.medium.com/tutorial-image-classification-with-keras-flow-from-directory-and-generators-95f75ebe5720\\nAdded by Bhaskar Sarma', 'section': '8. Neural Networks and Deep Learning', 'question': 'How keras flow_from_directory know the names of classes in images?', 'course': 'machine-learning-zoomcamp', 'id': 'f20a3479'}\n", "{'text': 'Problem:\\nI created a new environment in SaturnCloud and chose the image corresponding to Saturn with Tensorflow, but when I tried to fit the model it showed an error about the missing module: scipy\\nSolution:\\nInstall the module in a new cell: !pip install scipy\\nRestart the kernel and fit the model again\\nAdded by Erick Calderin', 'section': '8. Neural Networks and Deep Learning', 'question': 'Error with scipy missing module in SaturnCloud', 'course': 'machine-learning-zoomcamp', 'id': 'e7af4968'}\n", "{'text': 'The command to read folders in the dataset in the tensorflow source code is:\\nfor subdir in sorted(os.listdir(directory)):\\n…\\nReference: https://github.com/keras-team/keras/blob/master/keras/preprocessing/image.py, line 563\\nThis means folders will be read in alphabetical order. For example, in the case of a folder named dino, and another named dragon, dino will read first and will have class label 0, whereas dragon will be read in next and will have class label 1.\\nWhen a Keras model predicts binary labels, it will only return one value, and this is the probability of class 1 in case of sigmoid activation function in the last dense layer with 2 neurons. The probability of class 0 can be found out by:\\nprob(class(0)) = 1- prob(class(1))\\nIn case of using from_logits to get results, you will get two values for each of the labels.\\nA prediction of 0.8 is saying the probability that the image has class label 1 (in this case dragon), is 0.8, and conversely we can infer the probability that the image has class label 0 is 0.2.\\n(Added by Memoona Tahira)', 'section': '8. Neural Networks and Deep Learning', 'question': 'How are numeric class labels determined in flow_from_directroy using binary class mode and what is meant by the single probability predicted by a binary Keras model:', 'course': 'machine-learning-zoomcamp', 'id': '9fad096e'}\n", "{'text': \"It's fine, some small changes are expected\\nAlexey Grigorev\", 'section': '8. Neural Networks and Deep Learning', 'question': 'Does the actual values matter after predicting with a neural network or it should be treated as like hood of falling in a class?', 'course': 'machine-learning-zoomcamp', 'id': 'bcdf7407'}\n", "{'text': 'Problem:\\nI found running the wasp/bee model on my mac laptop had higher reported accuracy and lower std deviation than the HW answers. This may be because of the SGD optimizer. Running this on my mac printed a message about a new and legacy version that could be used.\\nSolution:\\nTry running the same code on google collab or another way. The answers were closer for me on collab. Another tip is to change the runtime to use T4 and the model run’s faster than just CPU\\nAdded by Quinn Avila', 'section': '8. Neural Networks and Deep Learning', 'question': 'What if your accuracy and std training loss don’t match HW?', 'course': 'machine-learning-zoomcamp', 'id': '8d1e7e20'}\n", "{'text': 'When running “model.fit(...)” an additional parameter “workers” can be specified for speeding up the data loading/generation. The default value is “1”. Try out which value between 1 and the cpu count on your system performs best.\\nhttps://www.tensorflow.org/api_docs/python/tf/keras/Model#fit\\nAdded by Sylvia Schmitt', 'section': '8. Neural Networks and Deep Learning', 'question': 'Using multi-threading for data generation in “model.fit()”', 'course': 'machine-learning-zoomcamp', 'id': '2023a9dc'}\n", "{'text': 'Reproducibility for training runs can be achieved following these instructions: \\nhttps://www.tensorflow.org/versions/r2.8/api_docs/python/tf/config/experimental/enable_op_determinism\\nseed = 1234\\ntf.keras.utils.set_random_seed(seed)\\ntf.config.experimental.enable_op_determinism()\\nThis will work for a script, if this gets executed multiple times.\\nAdded by Sylvia Schmitt', 'section': '8. Neural Networks and Deep Learning', 'question': 'Reproducibility with TensorFlow using a seed point', 'course': 'machine-learning-zoomcamp', 'id': '468f69ff'}\n", "{'text': 'Pytorch is also a deep learning framework that allows to do equivalent tasks as keras. Here is a tutorial to create a CNN from scratch using pytorch :\\nhttps://blog.paperspace.com/writing-cnns-from-scratch-in-pytorch/\\nThe functions have similar goals. The syntax can be slightly different. For the lessons and the homework, we use keras, but one can feel free to make a pull request with the equivalent with pytorch for lessons and homework!\\nMélanie Fouesnard', 'section': '8. Neural Networks and Deep Learning', 'question': 'Can we use pytorch for this lesson/homework ?', 'course': 'machine-learning-zoomcamp', 'id': 'c4ff26e5'}\n", "{'text': \"While training a Keras model you get the error “Failed to find data adapter that can handle input: , ” you may have unintentionally passed the image generator instead of the dataset to the model\\ntrain_gen = ImageDataGenerator(rescale=1./255)\\ntrain_ds = train_gen.flow_from_directory(…)\\nhistory_after_augmentation = model.fit(\\ntrain_gen, # this should be train_ds!!!\\nepochs=10,\\nvalidation_data=test_gen # this should be test_ds!!!\\n)\\nThe fix is simple and probably obvious once pointed out, use the training and validation dataset (train_ds and val_ds) returned from flow_from_directory\\nAdded by Tzvi Friedman\", 'section': '8. Neural Networks and Deep Learning', 'question': 'Keras model training fails with “Failed to find data adapter”', 'course': 'machine-learning-zoomcamp', 'id': '62722d72'}\n", "{'text': 'The command ‘nvidia-smi’ has a built-in function which will run it in subsequently updating it every N seconds without the need of using the command ‘watch’.\\nnvidia-smi -l \\nThe following command will run ‘nvidia-smi’ every 2 seconds until interrupted using CTRL+C.\\nnvidia-smi -l 2\\nAdded by Sylvia Schmitt', 'section': '8. Neural Networks and Deep Learning', 'question': 'Running ‘nvidia-smi’ in a loop without using ‘watch’', 'course': 'machine-learning-zoomcamp', 'id': 'd1419be1'}\n", "{'text': 'The Python package ‘’ is an interactive GPU process viewer similar to ‘htop’ for CPU.\\nhttps://pypi.org/project//\\nImage source: https://pypi.org/project//\\nAdded by Sylvia Schmitt', 'section': '8. Neural Networks and Deep Learning', 'question': 'Checking GPU and CPU utilization using ‘nvitop’', 'course': 'machine-learning-zoomcamp', 'id': 'a5f6f439'}\n", "{'text': \"Let’s say we define our Conv2d layer like this:\\n>> tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(150, 150, 3))\\nIt means our input image is RGB (3 channels, 150 by 150 pixels), kernel is 3x3 and number of filters (layer’s width) is 32.\\nIf we check model.summary() we will get this:\\n_________________________________________________________________\\nLayer (type) Output Shape Param #\\n=================================================================\\nconv2d (Conv2D) (None, 148, 148, 32) 896\\nSo where does 896 params come from? It’s computed like this:\\n>>> (3*3*3 +1) * 32\\n896\\n# 3x3 kernel, 3 channels RGB, +1 for bias, 32 filters\\nWhat about the number of “features” we get after the Flatten layer?\\nFor our homework model.summary() for last MaxPooling2d and Flatten layers looked like this:\\n_________________________________________________________________\\nLayer (type) Output Shape Param #\\n=================================================================\\nmax_pooling2d_3 (None, 7, 7, 128) 0\\nflatten (Flatten) (None, 6272) 0\\nSo where do 6272 vectors come from? It’s computed like this:\\n>>> 7*7*128\\n6272\\n# 7x7 “image shape” after several convolutions and poolings, 128 filters\\nAdded by Andrii Larkin\", 'section': '8. Neural Networks and Deep Learning', 'question': 'Q: Where does the number of Conv2d layer’s params come from? Where does the number of “features” we get after the Flatten layer come from?', 'course': 'machine-learning-zoomcamp', 'id': '879c1ec0'}\n", "{'text': 'It’s quite useful to understand that all types of models in the course are a plain stack of layers where each layer has exactly one input tensor and one output tensor (Sequential model TF page, Sequential class).\\nYou can simply start from an “empty” model and add more and more layers in a sequential order.\\nThis mode is called “Sequential Model API” (easier)\\nIn Alexey’s videos it is implemented as chained calls of different entities (“inputs”,“base”, “vectors”, “outputs”) in a more advanced mode “Functional Model API”.\\nMaybe a more complicated way makes sense when you do Transfer Learning and want to separate “Base” model vs. rest, but in the HW you need to recreate the full model from scratch ⇒ I believe it is easier to work with a sequence of “similar” layers.\\nYou can read more about it in this TF2 tutorial.\\nA really useful Sequential model example is shared in the Kaggle’s “Bee or Wasp” dataset folder with code: notebook\\nAdded by Ivan Brigida\\nFresh Run on Neural Nets\\nWhile correcting an error on neural net architecture, it is advised to do fresh run by restarting kernel, else the model learns on top of previous runs.\\nAdded by Abhijit Chakraborty', 'section': '8. Neural Networks and Deep Learning', 'question': 'Sequential vs. Functional Model Modes in Keras (TF2)', 'course': 'machine-learning-zoomcamp', 'id': '3ac604c3'}\n", "{'text': \"I found this code snippet fixed my OOM errors, as I have an Nvidia GPU. Can't speak to OOM errors on CPU, though.\\nhttps://www.tensorflow.org/api_docs/python/tf/config/experimental/set_memory_growth\\n```\\nphysical_devices = tf.configlist_physical_devices('GPU')\\ntry:\\ntf.config.experimental.set_memory_growth(physical_devices[0],True)\\nexcept:\\n# Invalid device or cannot modify virtual devices once initialized.\\npass\\n```\", 'section': '8. Neural Networks and Deep Learning', 'question': 'Out of memory errors when running tensorflow', 'course': 'machine-learning-zoomcamp', 'id': '0315aa96'}\n", "{'text': 'When training the models, in the fit function, you can specify the number of workers/threads.\\nThe number of threads apparently also works for GPUs, and came very handy in google colab for the T4 GPU, since it was very very slow, and workers default value is 1.\\nI changed the workers variable to 2560, following this thread in stackoverflow. I am using the free T4 GPU. (https://stackoverflow.com/questions/68208398/how-to-find-the-number-of-cores-in-google-colabs-gpu)\\nAdded by Ibai Irastorza', 'section': '8. Neural Networks and Deep Learning', 'question': 'Model training very slow in google colab with T4 GPU', 'course': 'machine-learning-zoomcamp', 'id': 'daf84bc3'}\n", "{'text': 'From the keras documentation:\\nDeprecated: tf.keras.preprocessing.image.ImageDataGenerator is not recommended for new code. Prefer loading images with tf.keras.utils.image_dataset_from_directory and transforming the output tf.data.Dataset with preprocessing layers. For more information, see the tutorials for loading images and augmenting images, as well as the preprocessing layer guide.\\nHrithik Kumar Advani', 'section': '9. Serverless Deep Learning', 'question': 'Using image_dataset_from_directory instead of ImageDataGeneratorn for loading images', 'course': 'machine-learning-zoomcamp', 'id': '1e956ca7'}\n", "{'text': 'TODO', 'section': '9. Serverless Deep Learning', 'question': 'How to get started with Week 9?', 'course': 'machine-learning-zoomcamp', 'id': '3ee083ab'}\n", "{'text': 'The week 9 uses a link to github to fetch the models.\\nThe original link was moved to here:\\nhttps://github.com/DataTalksClub/machine-learning-zoomcamp/releases', 'section': '9. Serverless Deep Learning', 'question': 'Where is the model for week 9?', 'course': 'machine-learning-zoomcamp', 'id': 'f826cba4'}\n", "{'text': 'Solution description\\nIn the unit 9.6, Alexey ran the command echo ${REMOTE_URI} which turned the URI address in the terminal. There workaround is to set a local variable (REMOTE_URI) and assign your URI address in the terminal and use it to login the registry, for instance, REMOTE_URI=2278222782.dkr.ecr.ap-south-1.amazonaws.com/clothing-tflite-images (fake address). One caveat is that you will lose this variable once the session is terminated.\\nI also had the same problem on Ubuntu terminal. I executed the following two commands:\\n$ export REMOTE_URI=1111111111.dkr.ecr.us-west-1.amazonaws.com/clothing-tflite-images:clothing-model-xception-v4-001\\n$ echo $REMOTE_URI\\n111111111.dkr.ecr.us-west-1.amazonaws.com/clothing-tflite-images:clothing-model-xception-v4-001\\nNote: 1. no curly brackets (e.g. echo ${REMOTE_URI}) needed unlike in video 9.6,\\n2. Replace REMOTE_URI with your URI\\n(Bhaskar Sarma)', 'section': '9. Serverless Deep Learning', 'question': 'Executing the command echo ${REMOTE_URI} returns nothing.', 'course': 'machine-learning-zoomcamp', 'id': '60fa95ed'}\n", "{'text': 'The command aws ecr get-login --no-include-email returns an invalid choice error:\\nThe solution is to use the following command instead: aws ecr get-login-password\\nCould simplify the login process with, just replace the and with your values:\\nexport PASSWORD=`aws ecr get-login-password`\\ndocker login -u AWS -p $PASSWORD .dkr.ecr..amazonaws.com/clothing-tflite-images\\nAdded by Martin Uribe', 'section': '9. Serverless Deep Learning', 'question': 'Getting a syntax error while trying to get the password from aws-cli', 'course': 'machine-learning-zoomcamp', 'id': '53f3ee10'}\n", "{'text': 'We can use the keras.models.Sequential() function to pass many parameters of the cnn at once.\\nKrishna Anand', 'section': '9. Serverless Deep Learning', 'question': 'Pass many parameters in the model at once', 'course': 'machine-learning-zoomcamp', 'id': '93aa4278'}\n", "{'text': 'This error is produced sometimes when building your docker image from the Amazon python base image.\\nSolution description: The following could solve the problem.\\nUpdate your docker desktop if you haven’t done so.\\nOr restart docker desktop and terminal and then build the image all over again.\\nOr if all else fails, first run the following command: DOCKER_BUILDKIT=0 docker build . then build your image.\\n(optional) Added by Odimegwu David', 'section': '9. Serverless Deep Learning', 'question': 'Getting ERROR [internal] load metadata for public.ecr.aws/lambda/python:3.8', 'course': 'machine-learning-zoomcamp', 'id': '0edeb016'}\n", "{'text': \"When trying to run the command !ls -lh in windows jupyter notebook , I was getting an error message that says “'ls' is not recognized as an internal or external command,operable program or batch file.\\nSolution description :\\nInstead of !ls -lh , you can use this command !dir , and you will get similar output\\nAsia Saeed\", 'section': '9. Serverless Deep Learning', 'question': \"Problem: 'ls' is not recognized as an internal or external command, operable program or batch file.\", 'course': 'machine-learning-zoomcamp', 'id': 'ba186de6'}\n", "{'text': 'When I run import tflite_runtime.interpreter as tflite , I get an error message says “ImportError: generic_type: type \"InterpreterWrapper\" is already registered!”\\nSolution description\\nThis error occurs when you import both tensorflow and tflite_runtime.interpreter “import tensorflow as tf” and “import tflite_runtime.interpreter as tflite” in the same notebook. To fix the issue, restart the kernel and import only tflite_runtime.interpreter \" import tflite_runtime.interpreter as tflite\".\\nAsia Saeed', 'section': '9. Serverless Deep Learning', 'question': 'ImportError: generic_type: type \"InterpreterWrapper\" is already registered!', 'course': 'machine-learning-zoomcamp', 'id': 'da2f1cf4'}\n", "{'text': 'Problem description:\\nIn command line try to do $ docker build -t dino_dragon\\ngot this Using default tag: latest\\n[2022-11-24T06:48:47.360149000Z][docker-credential-desktop][W] Windows version might not be up-to-date: The system cannot find the file specified.\\nerror during connect: This error may indicate that the docker daemon is not running.: Post\\n.\\nSolution description:\\nYou need to make sure that Docker is not stopped by a third-party program.\\nAndrei Ilin', 'section': '9. Serverless Deep Learning', 'question': 'Windows version might not be up-to-date', 'course': 'machine-learning-zoomcamp', 'id': '7fd648ca'}\n", "{'text': 'When running docker build -t dino-dragon-model it returns the above error\\nThe most common source of this error in this week is because Alex video shows a version of the wheel with python 8, we need to find a wheel with the version that we are working on. In this case python 9. Another common error is to copy the link, this will also produce the same error, we need to download the raw format:\\nhttps://github.com/alexeygrigorev/tflite-aws-lambda/raw/main/tflite/tflite_runtime-2.7.0-cp39-cp39-linux_x86_64.whl\\nPastor Soto', 'section': '9. Serverless Deep Learning', 'question': 'WARNING: You are using pip version 22.0.4; however, version 22.3.1 is available', 'course': 'machine-learning-zoomcamp', 'id': '42c09143'}\n", "{'text': 'Problem description:\\nIn video 9.6, after installing aswcli, we should configure it with aws configure . There it asks for Access Key ID, Secret Access Key, Default Region Name and also Default output format. What we should put for Default output format? Leaving it as None is okay?\\nSolution description:\\nYes, in my I case I left everything as the provided defaults (except, obviously, the Access key and the secret access key)\\nAdded by Bhaskar Sarma', 'section': '9. Serverless Deep Learning', 'question': 'How to do AWS configure after installing awscli', 'course': 'machine-learning-zoomcamp', 'id': 'd6d534fc'}\n", "{'text': 'Problem:\\nWhile passing local testing of the lambda function without issues, trying to test the same input with a running docker instance results in an error message like\\n{‘errorMessage’: ‘Unable to marshal response: Object of type float32 is not JSON serializable’, ‘errorType’: ‘Runtime.MarshalError’, ‘requestId’: ‘f155492c-9af2-4d04-b5a4-639548b7c7ac’, ‘stackTrace’: []}\\nThis happens when a model (in this case the dino vs dragon model) returns individual estimation values as numpy float32 values (arrays). They need to be converted individually to base-Python floats in order to become “serializable”.\\nSolution:\\nIn my particular case, I set up the dino vs dragon model in such a way as to return a label + predicted probability for each class as follows (below is a two-line extract of function predict() in the lambda_function.py):\\npreds = [interpreter.get_tensor(output_index)[0][0], \\\\\\n1-interpreter.get_tensor(output_index)[0][0]]\\nIn which case the above described solution will look like this:\\npreds = [float(interpreter.get_tensor(output_index)[0][0]), \\\\\\nfloat(1-interpreter.get_tensor(output_index)[0][0])]\\nThe rest can be made work by following the chapter 9 (and/or chapter 5!) lecture videos step by step.\\nAdded by Konrad Muehlberg', 'section': '9. Serverless Deep Learning', 'question': 'Object of type float32 is not JSON serializable', 'course': 'machine-learning-zoomcamp', 'id': 'b2c0c554'}\n", "{'text': 'I had this error when running the command line : interpreter.set_tensor(input_index, x) that can be seen in the video 9.3 around 12 minutes.\\nValueError: Cannot set tensor: Got value of type UINT8 but expected type FLOAT32 for input 0, name: serving_default_conv2d_input:0\\nThis is because the X is an int but a float is expected.\\nSolution:\\nI found this solution from this question here https://stackoverflow.com/questions/76102508/valueerror-cannot-set-tensor-got-value-of-type-float64-but-expected-type-float :\\n# Need to convert to float32 before set_tensor\\nX = np.float32(X)\\nThen, it works. I work with tensorflow 2.15.0, maybe the fact that this version is more recent involves this change ?\\nAdded by Mélanie Fouesnard', 'section': '9. Serverless Deep Learning', 'question': 'Error with the line “interpreter.set_tensor(input_index, X”)', 'course': 'machine-learning-zoomcamp', 'id': '819afebc'}\n", "{'text': 'To check your file size using the powershell terminal, you can do the following command lines:\\n$File = Get-Item -Path path_to_file\\n$FileSize = (Get-Item -Path $FilePath).Length\\nNow you can check the size of your file, for example in MB:\\nWrite-host \"MB\":($FileSize/1MB)\\nSource: https://www.sharepointdiary.com/2020/10/powershell-get-file-size.html#:~:text=To%20get%20the%20size%20of,the%20file%2C%20including%20its%20size.\\nAdded by Mélanie Fouesnard', 'section': '9. Serverless Deep Learning', 'question': 'How to easily get file size in powershell terminal ?', 'course': 'machine-learning-zoomcamp', 'id': '74551c54'}\n", "{'text': 'I wanted to understand how lambda container images work in depth and how lambda functions are initialized, for this reason, I found the following documentation\\nhttps://docs.aws.amazon.com/lambda/latest/dg/images-create.html\\nhttps://docs.aws.amazon.com/lambda/latest/dg/runtimes-api.html\\nAdded by Alejandro aponte', 'section': '9. Serverless Deep Learning', 'question': 'How do Lambda container images work?', 'course': 'machine-learning-zoomcamp', 'id': '4d98cd09'}\n", "{'text': 'The docker image for aws lambda can be created and pushed to aws ecr and the same can be exposed as a REST API through APIGatewayService in a single go using AWS Serverless Framework. Refer the below article for a detailed walkthrough.\\nhttps://medium.com/hoonio/deploy-containerized-serverless-flask-to-aws-lambda-c0eb87c1404d\\nAdded by Sumeet Lalla', 'section': '9. Serverless Deep Learning', 'question': 'How to use AWS Serverless Framework to deploy on AWS Lambda and expose it as REST API through APIGatewayService?', 'course': 'machine-learning-zoomcamp', 'id': '59a81fd5'}\n", "{'text': 'Problem:\\nWhile trying to build docker image in Section 9.5 with the command:\\ndocker build -t clothing-model .\\nIt throws a pip install error for the tflite runtime whl\\nERROR: failed to solve: process \"/bin/sh -c pip install https://github.com/alexeygrigorev/tflite-aws-lambda/blob/main/tflite/tflite_runtime-2.14.0-cp310-cp310-linux_x86_64.whl\" did not complete successfully: exit code: 1\\nTry to use this link: https://github.com/alexeygrigorev/tflite-aws-lambda/raw/main/tflite/tflite_runtime-2.14.0-cp310-cp310-linux_x86_64.whl\\nIf the link above does not work:\\nThe problem is because of the arm architecture of the M1. You will need to run the code on a PC or Ubuntu OS.\\nOr try the code bellow.\\nAdded by Dashel Ruiz Perez\\nSolution:\\nTo build the Docker image, use the command:\\ndocker build --platform linux/amd64 -t clothing-model .\\nTo run the built image, use the command:\\ndocker run -it --rm -p 8080:8080 --platform linux/amd64 clothing-model:latest\\nAdded by Daniel Egbo', 'section': '9. Serverless Deep Learning', 'question': 'Error building docker image on M1 Mac', 'course': 'machine-learning-zoomcamp', 'id': '35dbd6e2'}\n", "{'text': \"Problem: Trying to test API gateway in 9.7 - API Gateway: Exposing the Lambda Function, running: $ python test.py\\nWith error message:\\n{'message': 'Missing Authentication Token'}\\nSolution:\\nNeed to get the deployed API URL for the specific path you are invoking. Example:\\nhttps://.execute-api.us-east-2.amazonaws.com/test/predict\\nAdded by Andrew Katoch\", 'section': '9. Serverless Deep Learning', 'question': 'Error invoking API Gateway deploy API locally', 'course': 'machine-learning-zoomcamp', 'id': 'e5fe9efe'}\n", "{'text': 'Problem: When trying to install tflite_runtime with\\n!pip install --extra-index-url https://google-coral.github.io/py-repo/ tflite_runtime\\none gets an error message above.\\nSolution:\\nfflite_runtime is only available for the os-python version combinations that can be found here: https://google-coral.github.io/py-repo/tflite-runtime/\\nyour combination must be missing here\\nyou can see if any of these work for you https://github.com/alexeygrigorev/tflite-aws-lambda/tree/main/tflite\\nand install the needed one using pip\\neg\\npip install https://github.com/alexeygrigorev/tflite-aws-lambda/raw/main/tflite/tflite_runtime-2.7.0-cp38-cp38-linux_x86_64.whl\\nas it is done in the lectures code:\\nhttps://github.com/alexeygrigorev/mlbookcamp-code/blob/master/course-zoomcamp/09-serverless/code/Dockerfile#L4\\nAlternatively, use a virtual machine (with VM VirtualBox, for example) with a Linux system. The other way is to run a code at a virtual machine within cloud service, for example you can use Vertex AI Workbench at GCP (notebooks and terminals are provided there, so all tasks may be performed).\\nAdded by Alena Kniazeva, modified by Alex Litvinov', 'section': '9. Serverless Deep Learning', 'question': 'Error: Could not find a version that satisfies the requirement tflite_runtime (from versions:none)', 'course': 'machine-learning-zoomcamp', 'id': '5c043c62'}\n", "{'text': 'docker: Error response from daemon: mkdir /var/lib/docker/overlay2/37be849565da96ac3fce34ee9eb2215bd6cd7899a63ebc0ace481fd735c4cb0e-init: read-only file system.\\nYou need to restart the docker services to get rid of the above error\\nKrishna Anand', 'section': '9. Serverless Deep Learning', 'question': 'Docker run error', 'course': 'machine-learning-zoomcamp', 'id': 'af0739da'}\n", "{'text': 'The docker image can be saved/exported to tar format in local machine using the below command:\\ndocker image save -o \\nThe individual layers of the docker image for the filesystem content can be viewed by extracting the layer.tar present in the created from above.\\nSumeet Lalla', 'section': '9. Serverless Deep Learning', 'question': 'Save Docker Image to local machine and view contents', 'course': 'machine-learning-zoomcamp', 'id': '451bc25d'}\n", "{'text': 'On vscode running jupyter notebook. After I ‘pip install pillow’, my notebook did not recognize using the import for example from PIL import image. After restarting the jupyter notebook the imports worked.\\nQuinn Avila', 'section': '9. Serverless Deep Learning', 'question': 'Jupyter notebook not seeing package', 'course': 'machine-learning-zoomcamp', 'id': 'ea2e7458'}\n", "{'text': 'Due to experimenting back and forth so much without care for storage, I just ran out of it on my 30-GB AWS instance. It turns out that deleting docker images does not actually free up any space as you might expect. After removing images, you also need to run docker system prune', 'section': '9. Serverless Deep Learning', 'question': 'Running out of space for AWS instance.', 'course': 'machine-learning-zoomcamp', 'id': '6ce8e875'}\n", "{'text': 'Using the 2.14 version with python 3.11 works fine.\\nIn case it doesn’t work, I tried with tensorflow 2.4.4 whl, however, make sure to run it on top of supported python versions like 3.8, else there will be issues installing tf==2.4.4\\nAdded by Abhijit Chakraborty', 'section': '9. Serverless Deep Learning', 'question': 'Using Tensorflow 2.15 for AWS deployment', 'course': 'machine-learning-zoomcamp', 'id': 'b50e9e2b'}\n", "{'text': 'see here', 'section': '9. Serverless Deep Learning', 'question': 'Command aws ecr get-login --no-include-email returns “aws: error: argument operation: Invalid choice…”', 'course': 'machine-learning-zoomcamp', 'id': '29311ef5'}\n", "{'text': 'Sign in to the AWS Console: Log in to the AWS Console.\\nNavigate to IAM: Go to the IAM service by clicking on \"Services\" in the top left corner and selecting \"IAM\" under the \"Security, Identity, & Compliance\" section.\\nCreate a new policy: In the left navigation pane, select \"Policies\" and click on \"Create policy.\"\\nSelect the service and actions:\\nClick on \"JSON\" and copy and paste the JSON policy you provided earlier for the specific ECR actions.\\nReview and create the policy:\\nClick on \"Review policy.\"\\nProvide a name and description for the policy.\\nClick on \"Create policy.\"\\nJSON policy:\\n{\\n\"Version\": \"2012-10-17\",\\n\"Statement\": [\\n{\\n\"Sid\": \"VisualEditor0\",\\n\"Effect\": \"Allow\",\\n\"Action\": [\\n\"ecr:CreateRepository\",\\n\"ecr:GetAuthorizationToken\",\\n\"ecr:BatchCheckLayerAvailability\",\\n\"ecr:BatchGetImage\",\\n\"ecr:InitiateLayerUpload\",\\n\"ecr:UploadLayerPart\",\\n\"ecr:CompleteLayerUpload\",\\n\"ecr:PutImage\"\\n],\\n\"Resource\": \"*\"\\n}\\n]\\n}\\nAdded by: Daniel Muñoz-Viveros\\nERROR: failed to solve: public.ecr.aws/lambda/python:3.10: error getting credentials - err: exec: \"docker-credential-desktop.exe\": executable file not found in $PATH, out: ``\\n(WSL2 system)\\nSolved: Delete the file ~/.docker/config.json\\nYishan Zhan', 'section': '9. Serverless Deep Learning', 'question': 'What IAM permission policy is needed to complete Week 9: Serverless?', 'course': 'machine-learning-zoomcamp', 'id': '1e0dc11c'}\n", "{'text': 'Add the next lines to vim /etc/docker/daemon.json\\n{\\n\"dns\": [\"8.8.8.8\", \"8.8.4.4\"]\\n}\\nThen, restart docker: sudo service docker restart\\nIbai Irastorza', 'section': '9. Serverless Deep Learning', 'question': 'Docker Temporary failure in name resolution', 'course': 'machine-learning-zoomcamp', 'id': '1078aeb7'}\n", "{'text': \"Solution: add compile = False to the load_model function\\nkeras.models.load_model('model_name.h5', compile=False)\\nNadia Paz\", 'section': '9. Serverless Deep Learning', 'question': 'Keras model *.h5 doesn’t load. Error: weight_decay is not a valid argument, kwargs should be empty for `optimizer_experimental.Optimizer`', 'course': 'machine-learning-zoomcamp', 'id': '7daaca73'}\n", "{'text': 'This deployment setup can be tested locally using AWS RIE (runtime interface emulator).\\nBasically, if your Docker image was built upon base AWS Lambda image (FROM public.ecr.aws/lambda/python:3.10) - just use certain ports for “docker run” and a certain “localhost link” for testing:\\ndocker run -it --rm -p 9000:8080 name\\nThis command runs the image as a container and starts up an endpoint locally at:\\nlocalhost:9000/2015-03-31/functions/function/invocations\\nPost an event to the following endpoint using a curl command:\\ncurl -XPOST \"http://localhost:9000/2015-03-31/functions/function/invocations\" -d \\'{}\\'\\nExamples of curl testing:\\n* windows testing:\\ncurl -XPOST \"http://localhost:9000/2015-03-31/functions/function/invocations\" -d \"{\\\\\"url\\\\\": \\\\\"https://habrastorage.org/webt/rt/d9/dh/rtd9dhsmhwrdezeldzoqgijdg8a.jpeg\\\\\"}\"\\n* unix testing:\\ncurl -XPOST \"http://localhost:9000/2015-03-31/functions/function/invocations\" -d \\'{\"url\": \"https://habrastorage.org/webt/rt/d9/dh/rtd9dhsmhwrdezeldzoqgijdg8a.jpeg\"}\\'\\nIf during testing you encounter an error like this:\\n# {\"errorMessage\": \"Unable to marshal response: Object of type float32 is not JSON serializable\", \"errorType\": \"Runtime.MarshalError\", \"requestId\": \"7ea5d17a-e0a2-48d5-b747-a16fc530ed10\", \"stackTrace\": []}\\njust turn your response at lambda_handler() to string - str(result).\\nAdded by Andrii Larkin', 'section': '9. Serverless Deep Learning', 'question': 'How to test AWS Lambda + Docker locally?', 'course': 'machine-learning-zoomcamp', 'id': '0cfbe2e2'}\n", "{'text': 'Make sure all codes in test.py dont have any dependencies with tensorflow library. One of most common reason that lead the this error is tflite still imported from tensorflow. Change import tensorflow.lite as tflite to import tflite_runtime.interpreter as tflite\\nAdded by Ryan Pramana', 'section': '9. Serverless Deep Learning', 'question': '\"Unable to import module \\'lambda_function\\': No module named \\'tensorflow\\'\" when run python test.py', 'course': 'machine-learning-zoomcamp', 'id': '1460fb65'}\n", "{'text': 'I’ve tried to do everything in Google Colab. Here is a way to work with Docker in Google Colab:\\nhttps://gist.github.com/mwufi/6718b30761cd109f9aff04c5144eb885\\n\\uec03%%shell\\npip install udocker\\nudocker --allow-root install\\n\\uec02!udocker --allow-root run hello-world\\nAdded by Ivan Brigida\\nLambda API Gateway errors:\\n`Authorization header requires \\'Credential\\' parameter. Authorization header requires \\'Signature\\' parameter. Authorization header requires \\'SignedHeaders\\' parameter. Authorization header requires existence of either a \\'X-Amz-Date\\' or a \\'Date\\' header.`\\n`Missing Authentication Token`\\nimport boto3\\nclient = boto3.client(\\'apigateway\\')\\nresponse = client.test_invoke_method(\\nrestApiId=\\'your_rest_api_id\\',\\nresourceId=\\'your_resource_id\\',\\nhttpMethod=\\'POST\\',\\npathWithQueryString=\\'/test/predict\\', #depend how you set up the api\\nbody=\\'{\"url\": \"https://habrastorage.org/webt/rt/d9/dh/rtd9dhsmhwrdezeldzoqgijdg8a.jpeg\"}\\'\\n)\\nprint(response[\\'body\\'])\\nYishan Zhan\\nUnable to run pip install tflite_runtime from github wheel links?\\nTo overcome this issue, you can download the whl file to your local project folder and in the Docker file add the following lines:\\nCOPY .\\nRUN pip install \\nAbhijit Chakraborty', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Install Docker (udocker) in Google Colab', 'course': 'machine-learning-zoomcamp', 'id': 'd4f9efdc'}\n", "{'text': 'TODO', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'How to get started with Week 10?', 'course': 'machine-learning-zoomcamp', 'id': '6a417bfe'}\n", "{'text': 'Running a CNN on your CPU can take a long time and once you’ve run out of free time on some cloud providers, it’s time to pay up. Both can be tackled by installing tensorflow with CUDA support on your local machine if you have the right hardware.\\nI was able to get it working by using the following resources:\\nCUDA on WSL :: CUDA Toolkit Documentation (nvidia.com)\\nInstall TensorFlow with pip\\nStart Locally | PyTorch\\nI included the link to PyTorch so that you can get that one installed and working too while everything is fresh on your mind. Just select your options, and for Computer Platform, I chose CUDA 11.7 and it worked for me.\\nAdded by Martin Uribe', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'How to install Tensorflow in Ubuntu WSL2', 'course': 'machine-learning-zoomcamp', 'id': 'ed8b300d'}\n", "{'text': 'If you are running tensorflow on your own machine and you start getting the following errors:\\nAllocator (GPU_0_bfc) ran out of memory trying to allocate 6.88GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.\\nTry adding this code in a cell at the beginning of your notebook:\\nconfig = tf.compat.v1.ConfigProto()\\nconfig.gpu_options.allow_growth = True\\nsession = tf.compat.v1.Session(config=config)\\nAfter doing this most of my issues went away. I say most because there was one instance when I still got the error once more, but only during one epoch. I ran the code again, right after it finished, and I never saw the error again.\\nAdded by Martin Uribe', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Getting: Allocator ran out of memory errors?', 'course': 'machine-learning-zoomcamp', 'id': 'a64aed6b'}\n", "{'text': 'In session 10.3, when creating the virtual environment with pipenv and trying to run the script gateway.py, you might get this error:\\nTypeError: Descriptors cannot not be created directly.\\nIf this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.\\nIf you cannot immediately regenerate your protos, some other possible workarounds are:\\n1. Downgrade the protobuf package to 3.20.x or lower.\\n2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).\\nMore information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates\\nThis will happen if your version of protobuf is one of the newer ones. As a workaround, you can fix the protobuf version to an older one. In my case I got around the issue by creating the environment with:\\npipenv install --python 3.9.13 requests grpcio==1.42.0 flask gunicorn \\\\\\nkeras-image-helper tensorflow-protobuf==2.7.0 protobuf==3.19.6\\nAdded by Ángel de Vicente', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Problem with recent version of protobuf', 'course': 'machine-learning-zoomcamp', 'id': '727238ee'}\n", "{'text': 'Due to the uncertainties associated with machines, sometimes you can get the error message like this when you try to run a docker command:\\n”Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?”\\nSolution: The solution is simple. The Docker Desktop might no longer be connecting to the WSL Linux distro. What you need to do is go to your Docker Desktop setting and then click on resources. Under resources, click on WSL Integration. You will get a tab like the image below:\\nJust enable additional distros. That’s all. Even if the additional distro is the same as the default WSL distro.\\nOdimegwu David', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'WSL Cannot Connect To Docker Daemon', 'course': 'machine-learning-zoomcamp', 'id': '85d4901d'}\n", "{'text': 'In case the HPA instance does not run correctly even after installing the latest version of Metrics Server from the components.yaml manifest with:\\n>>kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml\\nAnd the targets still appear as \\nRun >>kubectl edit deploy -n kube-system metrics-server\\nAnd search for this line:\\nargs:\\n- --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname\\nAdd this line in the middle: - --kubelet-insecure-tls\\nSo that it stays like this:\\nargs:\\n- --kubelet-insecure-tls\\n- --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname\\nSave and run again >>kubectl get hpa\\nAdded by Marilina Orihuela', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'HPA instance doesn’t run properly', 'course': 'machine-learning-zoomcamp', 'id': 'df023a13'}\n", "{'text': 'In case the HPA instance does not run correctly even after installing the latest version of Metrics Server from the components.yaml manifest with:\\n>>kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml\\nAnd the targets still appear as \\nRun the following command:\\nkubectl apply -f https://raw.githubusercontent.com/Peco602/ml-zoomcamp/main/10-kubernetes/kube-config/metrics-server-deployment.yaml\\nWhich uses a metrics server deployment file already embedding the - --kubelet-insecure-tls option.\\nAdded by Giovanni Pecoraro', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'HPA instance doesn’t run properly (easier solution)', 'course': 'machine-learning-zoomcamp', 'id': '48e92d65'}\n", "{'text': \"When I run pip install grpcio==1.42.0 tensorflow-serving-api==2.7.0 to install the libraries in windows machine, I was getting the below error :\\nERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\\\\\\\Users\\\\\\\\Asia\\\\\\\\anaconda3\\\\\\\\Lib\\\\\\\\site-packages\\\\\\\\google\\\\\\\\protobuf\\\\\\\\internal\\\\\\\\_api_implementation.cp39-win_amd64.pyd'\\nConsider using the `--user` option or check the permissions.\\nSolution description :\\nI was able to install the libraries using below command:\\npip --user install grpcio==1.42.0 tensorflow-serving-api==2.7.0\\nAsia Saeed\", 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Could not install packages due to an OSError: [WinError 5] Access is denied', 'course': 'machine-learning-zoomcamp', 'id': '1685cae4'}\n", "{'text': 'Problem description\\nI was getting the below error message when I run gateway.py after modifying the code & creating virtual environment in video 10.3 :\\nFile \"C:\\\\Users\\\\Asia\\\\Data_Science_Code\\\\Zoompcamp\\\\Kubernetes\\\\gat.py\", line 9, in \\nfrom tensorflow_serving.apis import predict_pb2\\nFile \"C:\\\\Users\\\\Asia\\\\.virtualenvs\\\\Kubernetes-Ge6Ts1D5\\\\lib\\\\site-packages\\\\tensorflow_serving\\\\apis\\\\predict_pb2.py\", line 14, in \\nfrom tensorflow.core.framework import tensor_pb2 as tensorflow_dot_core_dot_framework_dot_tensor__pb2\\nFile \"C:\\\\Users\\\\Asia\\\\.virtualenvs\\\\Kubernetes-Ge6Ts1D5\\\\lib\\\\site-packages\\\\tensorflow\\\\core\\\\framework\\\\tensor_pb2.py\", line 14, in \\nfrom tensorflow.core.framework import resource_handle_pb2 as tensorflow_dot_core_dot_framework_dot_resource__handle__pb2\\nFile \"C:\\\\Users\\\\Asia\\\\.virtualenvs\\\\Kubernetes-Ge6Ts1D5\\\\lib\\\\site-packages\\\\tensorflow\\\\core\\\\framework\\\\resource_handle_pb2.py\", line 14, in \\nfrom tensorflow.core.framework import tensor_shape_pb2 as tensorflow_dot_core_dot_framework_dot_tensor__shape__pb2\\nFile \"C:\\\\Users\\\\Asia\\\\.virtualenvs\\\\Kubernetes-Ge6Ts1D5\\\\lib\\\\site-packages\\\\tensorflow\\\\core\\\\framework\\\\tensor_shape_pb2.py\", line 36, in \\n_descriptor.FieldDescriptor(\\nFile \"C:\\\\Users\\\\Asia\\\\.virtualenvs\\\\Kubernetes-Ge6Ts1D5\\\\lib\\\\site-packages\\\\google\\\\protobuf\\\\descriptor.py\", line 560, in __new__\\n_message.Message._CheckCalledFromGeneratedFile()\\nTypeError: Descriptors cannot not be created directly.\\nIf this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.\\nIf you cannot immediately regenerate your protos, some other possible workarounds are:\\n1. Downgrade the protobuf package to 3.20.x or lower.\\n2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).\\nSolution description:\\nIssue has been resolved by downgrading protobuf to version 3.20.1.\\npipenv install protobuf==3.20.1\\nAsia Saeed', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'TypeError: Descriptors cannot not be created directly.', 'course': 'machine-learning-zoomcamp', 'id': '4fb7b21e'}\n", "{'text': 'To install kubectl on windows using the terminal in vscode (powershell), I followed this tutorial: https://medium.com/@ggauravsigra/install-kubectl-on-windows-af77da2e6fff\\nI first downloaded kubectl with curl, with these command lines: https://kubernetes.io/docs/tasks/tools/install-kubectl-windows/#install-kubectl-binary-with-curl-on-windows\\nAt step 3, I followed the tutorial with the copy of the exe file in a specific folder on C drive.\\nThen I added this folder path to PATH in my environment variables.\\nKind can be installed the same way with the curl command on windows, by specifying a folder that will be added to the path environment variable.\\nAdded by Mélanie Fouesnard', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'How to install easily kubectl on windows ?', 'course': 'machine-learning-zoomcamp', 'id': '8bd3bfc2'}\n", "{'text': \"First you need to launch a powershell terminal with administrator privilege.\\nFor this we need to install choco library first through the following syntax in powershell:\\nSet-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))\\nKrishna Anand\", 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Install kind through choco library', 'course': 'machine-learning-zoomcamp', 'id': '03b5fc59'}\n", "{'text': 'If you are having challenges installing Kind through the Windows Powershell as provided on the website and Choco Library as I did, you can simply install Kind through Go.\\n> Download and Install Go (https://go.dev/doc/install)\\n> Confirm installation by typing the following in Command Prompt - go version\\n> Proceed by installing Kind by following this command - go install sigs.k8s.io/kind@v0.20.0\\n>Confirm Installation kind --version\\nIt works perfectly.', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Install Kind via Go package', 'course': 'machine-learning-zoomcamp', 'id': '7c31bc9a'}\n", "{'text': \"I ran into an issue where kubectl wasn't working.\\nI kept getting the following error:\\nkubectl get service\\nThe connection to the server localhost:8080 was refused - did you specify the right host or port?\\nI searched online for a resolution, but everyone kept talking about creating an environment variable and creating some admin.config file in my home directory.\\nAll hogwash.\\nThe solution to my problem was to just start over.\\nkind delete cluster\\nrm -rf ~/.kube\\nkind create cluster\\nNow when I try the same command again:\\nkubectl get service\\nNAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE\\nkubernetes ClusterIP 10.96.0.1 443/TCP 53s\\nAdded by Martin Uribe\", 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'The connection to the server localhost:8080 was refused - did you specify the right host or port?', 'course': 'machine-learning-zoomcamp', 'id': '605efc12'}\n", "{'text': 'Problem description\\nDue to experimenting back and forth so much without care for storage, I just ran out of it on my 30-GB AWS instance.\\nMy first reflex was to remove some zoomcamp directories, but of course those are mostly code so it didn’t help much.\\nSolution description\\n> docker images\\nrevealed that I had over 20 GBs worth of superseded / duplicate models lying around, so I proceeded to > docker rmi\\na bunch of those — but to no avail!\\nIt turns out that deleting docker images does not actually free up any space as you might expect. After removing images, you also need to run\\n> docker system prune\\nSee also: https://stackoverflow.com/questions/36799718/why-removing-docker-containers-and-images-does-not-free-up-storage-space-on-wind\\nAdded by Konrad Mühlberg', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Running out of storage after building many docker images', 'course': 'machine-learning-zoomcamp', 'id': 'c5cde96c'}\n", "{'text': 'Yes, the question does require for you to specify values for CPU and memory in the yaml file, however the question that it is use in the form only refers to the port which do have a define correct value for this specific homework.\\nPastor Soto', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'In HW10 Q6 what does it mean “correct value for CPU and memory”? Aren’t they arbitrary?', 'course': 'machine-learning-zoomcamp', 'id': 'd45d2da6'}\n", "{'text': 'In Kubernetes resource specifications, such as CPU requests and limits, the \"m\" stands for milliCPU, which is a unit of computing power. It represents one thousandth of a CPU core.\\ncpu: \"100m\" means the container is requesting 100 milliCPUs, which is equivalent to 0.1 CPU core.\\ncpu: \"500m\" means the container has a CPU limit of 500 milliCPUs, which is equivalent to 0.5 CPU core.\\nThese values are specified in milliCPUs to allow fine-grained control over CPU resources. It allows you to express CPU requirements and limits in a more granular way, especially in scenarios where your application might not need a full CPU core.\\nAdded by Andrii Larkin', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Why cpu vals for Kubernetes deployment.yaml look like “100m” and “500m”? What does \"m\" mean?', 'course': 'machine-learning-zoomcamp', 'id': '59823c72'}\n", "{'text': 'Problem: Failing to load docker-image to cluster (when you’ved named a cluster)\\nkind load docker-image zoomcamp-10-model:xception-v4-001\\nERROR: no nodes found for cluster \"kind\"\\nSolution: Specify cluster name with -n\\nkind -n clothing-model load docker-image zoomcamp-10-model:xception-v4-001\\nAndrew Katoch', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Kind cannot load docker image', 'course': 'machine-learning-zoomcamp', 'id': '665f7b27'}\n", "{'text': \"Problem: I download kind from the next command:\\ncurl.exe -Lo kind-windows-amd64.exe https://kind.sigs.k8s.io/dl/v0.17.0/kind-windows-amd64\\nWhen I try\\nkind --version\\nI get: 'kind' is not recognized as an internal or external command, operable program or batch file\\nSolution: The default name of executable is kind-windows-amd64.exe, so that you have to rename this file to kind.exe. Put this file in specific folder, and add it to PATH\\nAlejandro Aponte\", 'section': '10. Kubernetes and TensorFlow Serving', 'question': \"'kind' is not recognized as an internal or external command, operable program or batch file. (In Windows)\", 'course': 'machine-learning-zoomcamp', 'id': '0a406fe0'}\n", "{'text': 'Using kind with Rootless Docker or Rootless Podman requires some changes on the system (Linux), see kind – Rootless (k8s.io).\\nSylvia Schmitt', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Running kind on Linux with Rootless Docker or Rootless Podman', 'course': 'machine-learning-zoomcamp', 'id': '64b209b0'}\n", "{'text': 'Deploy and Access the Kubernetes Dashboard\\nLuke', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Kubernetes-dashboard', 'course': 'machine-learning-zoomcamp', 'id': '518c4cb8'}\n", "{'text': 'Make sure you are on AWS CLI v2 (check with aws --version)\\nhttps://docs.aws.amazon.com/cli/latest/userguide/cliv2-migration-instructions.html', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Correct AWS CLI version for eksctl', 'course': 'machine-learning-zoomcamp', 'id': '00882c83'}\n", "{'text': 'Problem Description:\\nIn video 10.3, when I was testing a flask service, I got the above error. I ran docker run .. in one terminal. When in second terminal I run python gateway.py, I get the above error.\\nSolution: This error has something to do with versions of Flask and Werkzeug. I got the same error, if I just import flask with from flask import Flask.\\nBy running pip freeze > requirements.txt,I found that their versions are Flask==2.2.2 and Werkzeug==2.2.2. This error appears while using an old version of werkzeug (2.2.2) with new version of flask (2.2.2). I solved it by pinning version of Flask into an older version with pipenv install Flask==2.1.3.\\nAdded by Bhaskar Sarma', 'section': '10. Kubernetes and TensorFlow Serving', 'question': \"TypeError: __init__() got an unexpected keyword argument 'unbound_message' while importing Flask\", 'course': 'machine-learning-zoomcamp', 'id': 'd6d483ce'}\n", "{'text': 'As per AWS documentation:\\nhttps://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html\\nYou need to do: (change the fields in red)\\naws ecr get-login-password --region region | docker login --username AWS --password-stdin aws_account_id.dkr.ecr.region.amazonaws.com\\nAlternatively you can run the following command without changing anything given you have a default region configured\\naws ecr get-login-password --region $(aws configure get region) | docker login --username AWS --password-stdin \"$(aws sts get-caller-identity --query \"Account\" --output text).dkr.ecr.$(aws configure get region).amazonaws.com\"\\nAdded by Humberto Rodriguez', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Command aws ecr get-login --no-include-email returns “aws: error: argument operation: Invalid choice…”', 'course': 'machine-learning-zoomcamp', 'id': 'f9711723'}\n", "{'text': 'While trying to run the docker code on M1:\\ndocker run --platform linux/amd64 -it --rm \\\\\\n-p 8500:8500 \\\\\\n-v $(pwd)/clothing-model:/models/clothing-model/1 \\\\\\n-e MODEL_NAME=\"clothing-model\" \\\\\\ntensorflow/serving:2.7.0\\nIt outputs the error:\\nError:\\nStatus: Downloaded newer image for tensorflow/serving:2.7.0\\n[libprotobuf FATAL external/com_google_protobuf/src/google/protobuf/generated_message_reflection.cc:2345] CHECK failed: file != nullptr:\\nterminate called after throwing an instance of \\'google::protobuf::FatalException\\'\\nwhat(): CHECK failed: file != nullptr:\\nqemu: uncaught target signal 6 (Aborted) - core dumped\\n/usr/bin/tf_serving_entrypoint.sh: line 3: 8 Aborted tensorflow_model_server --port=8500 --rest_api_port=8501 --model_name=${MODEL_NAME} --model_base_path=${MODEL_BASE_PATH}/${MODEL_NAME} \"$@\"\\nSolution\\ndocker pull emacski/tensorflow-serving:latest\\ndocker run -it --rm \\\\\\n-p 8500:8500 \\\\\\n-v $(pwd)/clothing-model:/models/clothing-model/1 \\\\\\n-e MODEL_NAME=\"clothing-model\" \\\\\\nemacski/tensorflow-serving:latest-linux_arm64\\nSee more here: https://github.com/emacski/tensorflow-serving-arm\\nAdded by Daniel Egbo', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Error downloading tensorflow/serving:2.7.0 on Apple M1 Mac', 'course': 'machine-learning-zoomcamp', 'id': '5bda3b94'}\n", "{'text': 'Similar to the one above but with a different solution the main reason is that emacski doesn’t seem to maintain the repo any more, the latest image is from 2 years ago at the time of writing (December 2023)\\nProblem:\\nWhile trying to run the docker code on Mac M2 apple silicon:\\ndocker run --platform linux/amd64 -it --rm \\\\\\n-p 8500:8500 \\\\\\n-v $(pwd)/clothing-model:/models/clothing-model/1 \\\\\\n-e MODEL_NAME=\"clothing-model\" \\\\\\ntensorflow/serving\\nYou get an error:\\n/usr/bin/tf_serving_entrypoint.sh: line 3: 7 Illegal instruction tensorflow_model_server --port=8500 --rest_api_port=8501 --model_name=${MODEL_NAME} --model_base_path=${MODEL_BASE_PATH}/${MODEL_NAME} \"$@\"\\nSolution:\\nUse bitnami/tensorflow-serving base image\\nLaunch it either using docker run\\ndocker run -d \\\\\\n--name tf_serving \\\\\\n-p 8500:8500 \\\\\\n-p 8501:8501 \\\\\\n-v $(pwd)/clothing-model:/bitnami/model-data/1 \\\\\\n-e TENSORFLOW_SERVING_MODEL_NAME=clothing-model \\\\\\nbitnami/tensorflow-serving:2\\nOr the following docker-compose.yaml\\nversion: \\'3\\'\\nservices:\\ntf_serving:\\nimage: bitnami/tensorflow-serving:2\\nvolumes:\\n- ${PWD}/clothing-model:/bitnami/model-data/1\\nports:\\n- 8500:8500\\n- 8501:8501\\nenvironment:\\n- TENSORFLOW_SERVING_MODEL_NAME=clothing-model\\nAnd run it with\\ndocker compose up\\nAdded by Alex Litvinov', 'section': '10. Kubernetes and TensorFlow Serving', 'question': 'Illegal instruction error when running tensorflow/serving image on Mac M2 Apple Silicon (potentially on M1 as well)', 'course': 'machine-learning-zoomcamp', 'id': 'cccd31cf'}\n", "{'text': 'Problem: CPU metrics Shows Unknown\\nNAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE\\ncredit-hpa Deployment/credit /20% 1 3 1 18s\\nFailedGetResourceMetric 2m15s (x169 over 44m) horizontal-pod-autoscaler failed to get cpu utilization: unable to get metrics for resource cpu: unable to fetch metrics from resource metrics API:\\nSolution:\\n-> Delete HPA (kubectl delete hpa credit-hpa)\\n-> kubectl apply -f https://raw.githubusercontent.com/pythianarora/total-practice/master/sample-kubernetes-code/metrics-server.yaml\\n-> Create HPA\\nThis should solve the cpu metrics report issue.\\nAdded by Priya V', 'section': '11. KServe', 'question': 'HPA doesn’t show CPU metrics', 'course': 'machine-learning-zoomcamp', 'id': '57f49999'}\n", "{'text': 'Problem description:\\nRunning this:\\ncurl -s \"https://raw.githubusercontent.com/kserve/kserve/release-0.9/hack/quick_install.sh\" | bash\\nFails with errors because of istio failing to update resources, and you are on kubectl > 1.25.0.\\nCheck kubectl version with kubectl version\\nSolution description\\nEdit the file “quick_install.bash” by downloading it with curl without running bash. Edit the versions of Istio and Knative as per the matrix on the KServe website.\\nRun the bash script now.\\nAdded by Andrew Katoch', 'section': '11. KServe', 'question': 'Errors with istio during installation', 'course': 'machine-learning-zoomcamp', 'id': '5cb58698'}\n", "{'text': 'Problem description\\nSolution description\\n(optional) Added by Name', 'section': 'Projects (Midterm and Capstone)', 'question': 'Problem title', 'course': 'machine-learning-zoomcamp', 'id': 'de650b41'}\n", "{'text': 'Answer: You can see them here (it’s taken from the 2022 cohort page). Go to the cohort folder for your own cohort’s deadline.', 'section': 'Projects (Midterm and Capstone)', 'question': 'What are the project deadlines?', 'course': 'machine-learning-zoomcamp', 'id': '9ffacaac'}\n", "{'text': 'Answer: All midterms and capstones are meant to be solo projects. [source @Alexey]', 'section': 'Projects (Midterm and Capstone)', 'question': 'Are projects solo or collaborative/group work?', 'course': 'machine-learning-zoomcamp', 'id': '4dfb5d4f'}\n", "{'text': 'Answer: Ideally midterms up to module-06, capstones include all modules in that cohort’s syllabus. But you can include anything extra that you want to feature. Just be sure to document anything not covered in class.\\nAlso watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\\nMore discussions:\\n[source1] [source2] [source3]', 'section': 'Projects (Midterm and Capstone)', 'question': 'What modules, topics, problem-sets should a midterm/capstone project cover? Can I do xyz?', 'course': 'machine-learning-zoomcamp', 'id': '0b8739b7'}\n", "{'text': \"These links apply to all projects, actually. Again, for some cohorts, the modules/syllabus might be different, so always check in your cohort’s folder as well for additional or different instructions, if any.\\nMidterm Project Sample: https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/course-zoomcamp/cohorts/2021/07-midterm-project\\nMidTerm Project Deliverables: https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/course-zoomcamp/projects\\nSubmit MidTerm Project: https://docs.google.com/forms/d/e/1FAIpQLSfgmOk0QrmHu5t0H6Ri1Wy_FDVS8I_nr5lY3sufkgk18I6S5A/viewform\\nDatasets:\\nhttps://www.kaggle.com/datasets and https://www.kaggle.com/competitions\\nhttps://archive.ics.uci.edu/ml/index.php\\nhttps://data.europa.eu/en\\nhttps://www.openml.org/search?type=data\\nhttps://newzealand.ai/public-data-sets\\nhttps://datasetsearch.research.google.com\\nWhat to do and Deliverables\\nThink of a problem that's interesting for you and find a dataset for that\\nDescribe this problem and explain how a model could be used\\nPrepare the data and doing EDA, analyze important features\\nTrain multiple models, tune their performance and select the best model\\nExport the notebook into a script\\nPut your model into a web service and deploy it locally with Docker\\nBonus points for deploying the service to the cloud\", 'section': 'Projects (Midterm and Capstone)', 'question': 'Crucial Links', 'course': 'machine-learning-zoomcamp', 'id': '9eb52679'}\n", "{'text': 'Answer: Previous cohorts projects page has instructions (youtube).\\nhttps://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2022/projects.md#midterm-project\\nAlexey and his team will compile a g-sheet with links to submitted projects with our hashed emails (just like when we check leaderboard for homework) that are ours to review within the evaluation deadline.\\n~~~ Added by Nukta Bhatia ~~~', 'section': 'Projects (Midterm and Capstone)', 'question': 'How to conduct peer reviews for projects?', 'course': 'machine-learning-zoomcamp', 'id': '7a1fcfd9'}\n", "{'text': 'See the answer here.', 'section': 'Projects (Midterm and Capstone)', 'question': 'Computing the hash for project review', 'course': 'machine-learning-zoomcamp', 'id': '1cfa62c5'}\n", "{'text': 'For the learning in public for this midterm project it seems that has a total value of 14!, Does this mean that we need make 14 posts?, Or the regular seven posts for each module and each one with a value of 2?, Or just one with a total value of 14?\\n14 posts, one for each day', 'section': 'Projects (Midterm and Capstone)', 'question': 'Learning in public links for the projects', 'course': 'machine-learning-zoomcamp', 'id': '2a78f52e'}\n", "{'text': 'You can use git-lfs (https://git-lfs.com/) for upload large file to github repository.\\nRyan Pramana', 'section': 'Projects (Midterm and Capstone)', 'question': \"My dataset is too large and I can't loaded in GitHub , do anyone knows about a solution?\", 'course': 'machine-learning-zoomcamp', 'id': '68aeab64'}\n", "{'text': 'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\\n(optional) David Odimegwu', 'section': 'Projects (Midterm and Capstone)', 'question': 'What If I submitted only two projects and failed to submit the third?', 'course': 'machine-learning-zoomcamp', 'id': '9a7c26e0'}\n", "{'text': 'Yes. You only need to review peers when you submit your project.\\nConfirmed on Slack by Alexey Grigorev (added by Rileen Sinha)', 'section': 'Projects (Midterm and Capstone)', 'question': \"I did the first two projects and skipped the last one so I wouldn't have two peer review in second capstone right?\", 'course': 'machine-learning-zoomcamp', 'id': '1fd83eb9'}\n", "{'text': 'Regarding Point 4 in the midterm deliverables, which states, \"Train multiple models, tune their performance, and select the best model,\" you might wonder, how many models should you train? The answer is simple: train as many as you can. The term \"multiple\" implies having more than one model, so as long as you have more than one, you\\'re on the right track.', 'section': 'Projects (Midterm and Capstone)', 'question': 'How many models should I train?', 'course': 'machine-learning-zoomcamp', 'id': 'fbaa5b20'}\n", "{'text': 'I am not sure how the project evaluate assignment works? Where do I find this? I have access to all the capstone 2 project, perhaps, I can randomly pick any to review.\\nAnswer:\\nThe link provided for example (2023/Capstone link ): https://docs.google.com/forms/d/e/1FAIpQLSdgoepohpgbM4MWTAHWuXa6r3NXKnxKcg4NDOm0bElAdXdnnA/viewform contains a list of all submitted projects to be evaluated. More specific, you are to review 3 assigned peer projects. In the spreadsheet are 3 hash values of your assigned peer projects. However, you need to derive the your hash value of your email address and find the value on the spreadsheet under the (reviewer_hash) heading.\\nTo calculate your hash value run the python code below:\\nfrom hashlib import sha1\\ndef compute_hash(email):\\nreturn sha1(email.lower().encode(\\'utf-8\\')).hexdigest()\\n# Example usage **** enter your email below (Example1@gmail.com)****\\nemail = \"Example1@gmail.com\"\\nhashed_email = compute_hash(email)\\nprint(\"Original Email:\", email)\\nprint(\"Hashed Email (SHA-1):\", hashed_email)\\nEdit the above code to replace Example1@gmail.com as your email address\\nStore and run the above python code from your terminal. See below as the Hashed Email (SHA-1) value\\nYou then go to the link: https://docs.google.com/spreadsheets/d/e/2PACX-1vR-7RRtq7AMx5OzI-tDbkzsbxNLm-NvFOP5OfJmhCek9oYcDx5jzxtZW2ZqWvBqc395UZpHBv1of9R1/pubhtml?gid=876309294&single=true\\nLastly, copy the “Hashed Email (SHA-1): bd9770be022dede87419068aa1acd7a2ab441675” value and search for 3 identical entries. There you should see your peer project to be reviewed.\\nBy Emmanuel Ayeni', 'section': 'Projects (Midterm and Capstone)', 'question': 'How does the project evaluation work for you as a peer reviewer?', 'course': 'machine-learning-zoomcamp', 'id': '37eab341'}\n", "{'text': 'Alexey Grigorev: “It’s based on all the scores to make sure most of you pass.” By Annaliese Bronz\\nOther course-related questions that don’t fall into any of the categories above or can apply to more than one category/module', 'section': 'Miscellaneous', 'question': 'Do you pass a project based on the average of everyone else’s scores or based on the total score you earn?', 'course': 'machine-learning-zoomcamp', 'id': '57754faf'}\n", "{'text': 'Answer: The train.py file will be used by your peers to review your midterm project. It is for them to cross-check that your training process works on someone else’s system. It should also be included in the environment in conda or with pipenv.\\nOdimegwu David', 'section': 'Miscellaneous', 'question': 'Why do I need to provide a train.py file when I already have the notebook.ipynb file?', 'course': 'machine-learning-zoomcamp', 'id': '6979c5d1'}\n", "{'text': \"Pip install pillow - install pillow library\\nfrom PIL import Image\\nimg = Image.open('aeroplane.png')\\nFrom numpy import asarray\\nnumdata=asarray(img)\\nKrishna Anand\", 'section': 'Miscellaneous', 'question': 'Loading the Image with PILLOW library and converting to numpy array', 'course': 'machine-learning-zoomcamp', 'id': 'a1bd8c34'}\n", "{'text': \"Ans: train.py has to be a python file. This is because running a python script for training a model is much simpler than running a notebook and that's how training jobs usually look like in real life.\", 'section': 'Miscellaneous', 'question': 'Is a train.py file necessary when you have a train.ipynb file in your midterm project directory?', 'course': 'machine-learning-zoomcamp', 'id': 'b2ab0fc1'}\n", "{'text': 'Yes, you can create a mobile app or interface that manages these forms and validations. But you should also perform validations on backend.\\nYou can also check Streamlit: https://github.com/DataTalksClub/project-of-the-week/blob/main/2022-08-14-frontend.md\\nAlejandro Aponte', 'section': 'Miscellaneous', 'question': 'Is there a way to serve up a form for users to enter data for the model to crunch on?', 'course': 'machine-learning-zoomcamp', 'id': '80c439a9'}\n", "{'text': \"Using model.feature_importances_ can gives you an error:\\nAttributeError: 'Booster' object has no attribute 'feature_importances_'\\nAnswer: if you train the model like this: model = xgb.train you should use get_score() instead\\nEkaterina Kutovaia\", 'section': 'Miscellaneous', 'question': 'How to get feature importance for XGboost model', 'course': 'machine-learning-zoomcamp', 'id': 'ff93b86e'}\n", "{'text': 'In the Elastic Container Service task log, error “[Errno 12] Cannot allocate memory” showed up.\\nJust increase the RAM and CPU in your task definition.\\nHumberto Rodriguez', 'section': 'Miscellaneous', 'question': '[Errno 12] Cannot allocate memory in AWS Elastic Container Service', 'course': 'machine-learning-zoomcamp', 'id': 'fcd86c8f'}\n", "{'text': \"When running a docker container with waitress serving the app.py for making predictions, pickle will throw an error that can't get attribute on module __main__.\\nThis does not happen when Flask is used directly, i.e. not through waitress.\\nThe problem is that the model uses a custom column transformer class, and when the model was saved, it was saved from the __main__ module (e.g. python train.py). Pickle will reference the class in the global namespace (top-level code): __main__..\\nWhen using waitress, waitress will load the predict_app module and this will call pickle.load, that will try to find __main__. that does not exist.\\nSolution:\\nPut the class into a separate module and import it in both the script that saves the model (e.g. train.py) and the script that loads the model (e.g. predict.py)\\nNote: If Flask is used (no waitress) in predict.py, and predict.py has the definition of the class, When it is run: python predict.py, it will work because the class is in the same namespace as the one used when the model was saved (__main__).\\nDetailed info: https://stackoverflow.com/questions/27732354/unable-to-load-files-using-pickle-and-multiple-modules\\nMarcos MJD\", 'section': 'Miscellaneous', 'question': 'Pickle error: can’t get attribute XXX on module __main__', 'course': 'machine-learning-zoomcamp', 'id': '236864c2'}\n", "{'text': 'There are different techniques, but the most common used are the next:\\nDataset transformation (for example, log transformation)\\nClipping high values\\nDropping these observations\\nAlena Kniazeva', 'section': 'Miscellaneous', 'question': 'How to handle outliers in a dataset?', 'course': 'machine-learning-zoomcamp', 'id': 'efc4a04f'}\n", "{'text': 'I was getting the below error message when I was trying to create docker image using bentoml\\n[bentoml-cli] `serve` failed: Failed loading Bento from directory /home/bentoml/bento: Failed to import module \"service\": No module named \\'sklearn\\'\\nSolution description\\nThe cause was because , in bentofile.yaml, I wrote sklearn instead of scikit-learn. Issue was fixed after I modified the packages list as below.\\npackages: # Additional pip packages required by the service\\n- xgboost\\n- scikit-learn\\n- pydantic\\nAsia Saeed', 'section': 'Miscellaneous', 'question': 'Failed loading Bento from directory /home/bentoml/bento: Failed to import module \"service\": No module named \\'sklearn\\'', 'course': 'machine-learning-zoomcamp', 'id': '15f361b7'}\n", "{'text': \"You might see a long error message with something about sparse matrices, and in the swagger UI, you get a code 500 error with “” (empty string) as output.\\nPotential reason: Setting DictVectorizer or OHE to sparse while training, and then storing this in a pipeline or custom object in the benotml model saving stage in train.py. This means that when the custom object is called in service.py, it will convert each input to a different sized sparse matrix, and this can't be batched due to inconsistent length. In this case, bentoml model signatures should have batchable set to False for production during saving the bentoml mode in train.py.\\n(Memoona Tahira)\", 'section': 'Miscellaneous', 'question': 'BentoML not working with –production flag at any stage: e.g. with bentoml serve and while running the bentoml container', 'course': 'machine-learning-zoomcamp', 'id': 'dbbce78b'}\n", "{'text': 'Problem description:\\nDo we have to run everything?\\nYou are encouraged, if you can, to run them. As this provides another opportunity to learn from others.\\nNot everyone will be able to run all the files, in particular the neural networks.\\nSolution description:\\nAlternatively, can you see that everything you need to reproduce is there: the dataset is there, the instructions are there, are there any obvious errors and so on.\\nRelated slack conversation here.\\n(Gregory Morris)', 'section': 'Miscellaneous', 'question': 'Reproducibility', 'course': 'machine-learning-zoomcamp', 'id': 'f3a00e15'}\n", "{'text': \"If your model is too big for github one option is to try and compress the model using joblib. For example joblib.dump(model, model_filename, compress=('zlib', 6) will use zlib to compress the model. Just note this could take a few moments as the model is being compressed.\\nQuinn Avila\", 'section': 'Miscellaneous', 'question': 'Model too big', 'course': 'machine-learning-zoomcamp', 'id': '9102b3c0'}\n", "{'text': \"When you try to push the docker image to Google Container Registry and get this message “unauthorized: You don't have the needed permissions to perform this operation, and you may have invalid credentials.”, type this below on console, but first install https://cloud.google.com/sdk/docs/install, this is to be able to use gcloud in console:\\ngcloud auth configure-docker\\n(Jesus Acuña)\", 'section': 'Miscellaneous', 'question': 'Permissions to push docker to Google Container Registry', 'course': 'machine-learning-zoomcamp', 'id': '70d89fdf'}\n", "{'text': 'I am getting this error message when I tried to install tflite in a pipenv environment\\nError: An error occurred while installing tflite_runtime!\\nError text:\\nERROR: Could not find a version that satisfies the requirement tflite_runtime (from versions: none)\\nERROR: No matching distribution found for tflite_runtime\\nThis version of tflite do not run on python 3.10, the way we can make it work is by install python 3.9, after that it would install the tflite_runtime without problem.\\nPastor Soto\\nCheck all available versions here:\\nhttps://google-coral.github.io/py-repo/tflite-runtime/\\nIf you don’t find a combination matching your setup, try out the options at\\nhttps://github.com/alexeygrigorev/tflite-aws-lambda/tree/main/tflite\\nwhich you can install as shown in the lecture, e.g.\\npip install https://github.com/alexeygrigorev/tflite-aws-lambda/raw/main/tflite/tflite_runtime-2.7.0-cp38-cp38-linux_x86_64.whl\\nFinally, if nothing works, use the TFLite included in TensorFlow for local development, and use Docker for testing Lambda.\\nRileen Sinha (based on discussions on Slack)', 'section': 'Miscellaneous', 'question': 'Tflite_runtime unable to install', 'course': 'machine-learning-zoomcamp', 'id': 'c5d6a804'}\n", "{'text': \"Error: ImageDataGenerator name 'scipy' is not defined.\\nCheck that scipy is installed in your environment.\\nRestart jupyter kernel and try again.\\nMarcos MJD\", 'section': 'Miscellaneous', 'question': 'Error when running ImageDataGenerator.flow_from_dataframe', 'course': 'machine-learning-zoomcamp', 'id': '8c7f089f'}\n", "{'text': 'Tim from BentoML has prepared a dedicated video tutorial wrt this use case here:\\nhttps://www.youtube.com/watch?v=7gI1UH31xb4&list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR&index=97\\nKonrad Muehlberg', 'section': 'Miscellaneous', 'question': 'How to pass BentoML content / docker container to Amazon Lambda', 'course': 'machine-learning-zoomcamp', 'id': '739bcccf'}\n", "{'text': \"In deploying model part, I wanted to test my model locally on a test-image data and I had this silly error after the following command:\\nurl = 'https://github.com/bhasarma/kitchenware-classification-project/blob/main/test-image.jpg'\\nX = preprocessor.from_url(url)\\nI got the error:\\nUnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7f797010a590>\\nSolution:\\nAdd ?raw=true after .jpg in url. E.g. as below\\nurl = ‘https://github.com/bhasarma/kitchenware-classification-project/blob/main/test-image.jpg?raw=true’\\nBhaskar Sarma\", 'section': 'Miscellaneous', 'question': 'Error UnidentifiedImageError: cannot identify image file', 'course': 'machine-learning-zoomcamp', 'id': '4603e4e5'}\n", "{'text': 'Problem: If you run pipenv install and get this message. Maybe manually change Pipfile and Pipfile.lock.\\nSolution: Run: ` pipenv lock` for fix this problem and dependency files\\nAlejandro Aponte', 'section': 'Miscellaneous', 'question': '[pipenv.exceptions.ResolutionFailure]: Warning: Your dependencies could not be resolved. You likely have a mismatch in your sub-dependencies', 'course': 'machine-learning-zoomcamp', 'id': '0a7c328e'}\n", "{'text': 'Problem: In the course this function worked to get the features from the dictVectorizer instance: dv.get_feature_names(). But in my computer did not work. I think it has to do with library versions and but apparently that function will be deprecated soon:\\nOld: https://scikit-learn.org/0.22/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.get_feature_names\\nNew: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.get_feature_names\\nSolution: change the line dv.get_feature_names() to list(dv.get_feature_names_out))\\nIbai Irastorza', 'section': 'Miscellaneous', 'question': 'Get_feature_names() not found', 'course': 'machine-learning-zoomcamp', 'id': '77efd069'}\n", "{'text': 'Problem happens when contacting the server waiting to send your predict-test and your data here in the correct shape.\\nThe problem was the format input to the model wasn’t in the right shape. Server receives the data in json format (dict) which is not suitable for the model. U should convert it to like numpy arrays.\\nAhmed Okka', 'section': 'Miscellaneous', 'question': 'Error decoding JSON response: Expecting value: line 1 column 1 (char 0)', 'course': 'machine-learning-zoomcamp', 'id': 'cc60f7bc'}\n", "{'text': \"Q: Hii folks, I tried deploying my docker image on Render, but it won't I get SIGTERM everytime.\\nI think .5GB RAM is not enough, is there any other free alternative available ?\\nA: aws (amazon), gcp (google), saturn.\\nBoth aws and gcp give microinstance for free for a VERY long time, and a bunch more free stuff.\\nSaturn even provides free GPU instances. Recent promo link from mlzoomcamp for Saturn:\\n“You can sign up here: https://bit.ly/saturn-mlzoomcamp\\nWhen you sign up, write in the chat box that you're an ML Zoomcamp student and you should get extra GPU hours (something like 150)”\\nAdded by Andrii Larkin\", 'section': 'Miscellaneous', 'question': 'Free cloud alternatives', 'course': 'machine-learning-zoomcamp', 'id': 'aa13dd66'}\n", "{'text': \"Problem description: I have one column day_of_the_month . It has values 1, 2, 20, 25 etc. and int . I have a second column month_of_the_year. It has values jan, feb, ..dec. and are string. I want to convert these two columns into one column day_of_the_year and I want them to be int. 2 and jan should give me 2, i.e. 2nd day of the year, 1 and feb should give me 32, i.e. 32 nd day of the year. What is the simplest pandas-way to do it?\\nSolution description:\\nconvert dtype in day_of_the_month column from int to str with df['day_of_the_month'] = df['day_of_the_month'].map(str)\\nconvert month_of_the_year column in jan, feb ...,dec into 1,2, ..,12 string using map()\\nconvert day and month into a datetime object with:\\ndf['date_formatted'] = pd.to_datetime(\\ndict(\\nyear='2055',\\nmonth=df['month'],\\nday=df['day']\\n)\\n)\\nget day of year with: df['day_of_year']=df['date_formatted'].dt.dayofyear\\n(Bhaskar Sarma)\", 'section': 'Miscellaneous', 'question': 'Getting day of the year from day and month column', 'course': 'machine-learning-zoomcamp', 'id': 'c41e479c'}\n", "{'text': 'How to visualize the predictions per classes after training a neural net\\nSolution description\\nclasses, predictions = zip(*dict(zip(classes, predictions)).items())\\nplt.figure(figsize=(12, 3))\\nplt.bar(classes, predictions)\\nLuke', 'section': 'Miscellaneous', 'question': 'Chart for classes and predictions', 'course': 'machine-learning-zoomcamp', 'id': '2f28dcf1'}\n", "{'text': 'You can convert the prediction output values to a datafarme using \\ndf = pd.DataFrame.from_dict(dict, orient=\\'index\\' , columns=[\"Prediction\"])\\nEdidiong Esu', 'section': 'Miscellaneous', 'question': 'Convert dictionary values to Dataframe table', 'course': 'machine-learning-zoomcamp', 'id': '7a69cccf'}\n", "{'text': 'The image dataset for the competition was in a different layout from what we used in the dino vs dragon lesson. Since that’s what was covered, some folks were more comfortable with that setup, so I wrote a script that would generate it for them\\nIt can be found here: kitchenware-dataset-generator | Kaggle\\nMartin Uribe', 'section': 'Miscellaneous', 'question': 'Kitchenware Classification Competition Dataset Generator', 'course': 'machine-learning-zoomcamp', 'id': '20174c95'}\n", "{'text': 'Install Nvidia drivers: https://www.nvidia.com/download/index.aspx.\\nWindows:\\nInstall Anaconda prompt https://www.anaconda.com/\\nTwo options:\\nInstall package ‘tensorflow-gpu’ in Anaconda\\nInstall the Tensorflow way https://www.tensorflow.org/install/pip#windows-native\\nWSL/Linux:\\nWSL: Use the Windows Nvida drivers, do not touch that.\\nTwo options:\\nInstall the Tensorflow way https://www.tensorflow.org/install/pip#linux_1\\nMake sure to follow step 4 to install CUDA by environment\\nAlso run:\\necho ‘export XLA_FLAGS=--xla_gpu_cuda_data_dir=$CONDA_PREFIX/lib/> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh\\nInstall CUDA toolkit 11.x.x https://developer.nvidia.com/cuda-toolkit-archive\\nInstall https://developer.nvidia.com/rdp/cudnn-download\\nNow you should be able to do training/inference with GPU in Tensorflow\\n(Learning in public links Links to social media posts where you share your progress with others (LinkedIn, Twitter, etc). Use #mlzoomcamp tag. The scores for this part will be capped at 7 points. Please make sure the posts are valid URLs starting with \"https://\" Does it mean that I should provide my linkedin link? or it means that I should write a post that I have completed my first assignement? (\\nANS (by ezehcp7482@gmail.com): Yes, provide the linkedIN link to where you posted.\\nezehcp7482@gmail.com:\\nPROBLEM: Since I had to put up a link to a public repository, I had to use Kaggle and uploading the dataset therein was a bit difficult; but I had to ‘google’ my way out.\\nANS: See this link for a guide (https://www.kaggle.com/code/dansbecker/finding-your-files-in-kaggle-kernels/notebook)', 'section': 'Miscellaneous', 'question': 'CUDA toolkit and cuDNN Install for Tensorflow', 'course': 'machine-learning-zoomcamp', 'id': 'f2cd48b6'}\n", "{'text': 'When multiplying matrices, the order of multiplication is important.\\nFor example:\\nA (m x n) * B (n x p) = C (m x p)\\nB (n x p) * A (m x n) = D (n x n)\\nC and D are matrices of different sizes and usually have different values. Therefore the order is important in matrix multiplication and changing the order changes the result.\\nBaran Akın', 'section': 'Miscellaneous', 'question': 'About getting the wrong result when multiplying matrices', 'course': 'machine-learning-zoomcamp', 'id': '59b4324f'}\n", "{'text': 'Refer to https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/01-intro/06-environment.md\\n(added by Rileen Sinha)', 'section': 'Miscellaneous', 'question': 'None of the videos have how to install the environment in Mac, does someone have instructions for Mac with M1 chip?', 'course': 'machine-learning-zoomcamp', 'id': 'e1dc1ed9'}\n", "{'text': \"Depends on whether the form will still be open. If you're lucky and it's open, you can submit your homework and it will be evaluated. if closed - it's too late.\\n(Added by Rileen Sinha, based on answer by Alexey on Slack)\", 'section': 'Miscellaneous', 'question': 'I may end up submitting the assignment late. Would it be evaluated?', 'course': 'machine-learning-zoomcamp', 'id': 'fc60bf3b'}\n", "{'text': 'Yes. Whoever corrects the homework will only be able to access the link if the repository is public.\\n(added by Tano Bugelli)\\nHow to install Conda environment in my local machine?\\nWhich ide is recommended for machine learning?', 'section': 'Miscellaneous', 'question': 'Does the github repository need to be public?', 'course': 'machine-learning-zoomcamp', 'id': '1e60e888'}\n", "{'text': 'Install w get:\\n!which wget\\nDownload data:\\n!wget -P /content/drive/My\\\\ Drive/Downloads/ URL\\n(added by Paulina Hernandez)', 'section': 'Miscellaneous', 'question': 'How to use wget with Google Colab?', 'course': 'machine-learning-zoomcamp', 'id': '44552c2e'}\n", "{'text': \"Features (X) must always be formatted as a 2-D array to be accepted by scikit-learn.\\nUse reshape to reshape a 1D array to a 2D.\\n\\t\\t\\t\\t\\t\\t\\t(-Aileah) :>\\n(added by Tano\\nfiltered_df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]\\n# Select only the desired columns\\nselected_columns = [\\n'latitude',\\n'longitude',\\n'housing_median_age',\\n'total_rooms',\\n'total_bedrooms',\\n'population',\\n'households',\\n'median_income',\\n'median_house_value'\\n]\\nfiltered_df = filtered_df[selected_columns]\\n# Display the first few rows of the filtered DataFrame\\nprint(filtered_df.head())\", 'section': 'Miscellaneous', 'question': 'Features in scikit-learn?', 'course': 'machine-learning-zoomcamp', 'id': '7116b3be'}\n", "{'text': 'FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead', 'section': 'Miscellaneous', 'question': 'When I plotted using Matplot lib to check if median has a tail, I got the error below how can one bypass?', 'course': 'machine-learning-zoomcamp', 'id': '5d4d206e'}\n", "{'text': 'When trying to rerun the docker file in Windows, as opposed to developing in WSL/Linux, I got the error of:\\n```\\nWarning: Python 3.11 was not found on your system…\\nNeither ‘pipenv’ nor ‘asdf’ could be found to install Python.\\nYou can specify specific versions of Python with:\\n$ pipenv –python path\\\\to\\\\python\\n```\\nThe solution was to add Python311 installation folder to the PATH and restart the system and run the docker file again. That solved the error.\\n(Added by Abhijit Chakraborty)', 'section': 'Miscellaneous', 'question': 'Reproducibility in different OS', 'course': 'machine-learning-zoomcamp', 'id': '387093cc'}\n", "{'text': 'You may quickly deploy your project to DigitalOcean App Cloud. The process is relatively straightforward. The deployment costs about 5 USD/month. The container needs to be up until the end of the project evaluation.\\nSteps:\\nRegister in DigitalOcean\\nGo to Apps -> Create App.\\nYou will need to choose GitHub as a service provider.\\nEdit Source Directory (if your project is not in the repo root)\\nIMPORTANT: Go to settings -> App Spec and edit the Dockerfile path so it looks like ./project/Dockerfile path relative to your repo root\\nRemember to add model files if they are not built automatically during the container build process.\\nBy Dmytro Durach', 'section': 'Miscellaneous', 'question': 'Deploying to Digital Ocean', 'course': 'machine-learning-zoomcamp', 'id': 'd12a2657'}\n", "{'text': \"I’m just looking back at the lessons in week 3 (churn prediction project), and lesson 3.6 talks about Feature Importance for categorical values. At 8.12, the mutual info scores show that the some features are more important than others, but then in lesson 3.10 the Logistic Regression model is trained on all of the categorical variables (see 1:35). Once we have done feature importance, is it best to train your model only on the most important features?\\nNot necessarily - rather, any feature that can offer additional predictive value should be included (so, e.g. predict with & without including that feature; if excluding it drops performance, keep it, else drop it). A few individually important features might in fact be highly correlated with others, & dropping some might be fine. There are many feature selection algorithms, it might be interesting to read up on them (among the methods we've learned so far in this course, L1 regularization (Lasso) implicitly does feature selection by shrinking some weights all the way to zero).\\nBy Rileen Sinha\", 'section': 'Miscellaneous', 'question': 'Is it best to train your model only on the most important features?', 'course': 'machine-learning-zoomcamp', 'id': 'eb7a57a6'}\n", "{'text': 'You can consider several different approaches:\\nSampling: In the exploratory phase, you can use random samples of the data.\\nChunking: When you do need all the data, you can read and process it in chunks that do fit in the memory.\\nOptimizing data types: Pandas’ automatic data type inference (when reading data in) might result in e.g. float64 precision being used to represent integers, which wastes space. You might achieve substantial memory reduction by optimizing the data types.\\nUsing Dask, an open-source python project which parallelizes Numpy and Pandas.\\n(see, e.g. https://www.vantage-ai.com/en/blog/4-strategies-how-to-deal-with-large-datasets-in-pandas)\\nBy Rileen Sinha', 'section': 'Miscellaneous', 'question': 'How can I work with very large datasets, e.g. the New York Yellow Taxi dataset, with over a million rows?', 'course': 'machine-learning-zoomcamp', 'id': 'd6f0c6ea'}\n", "{'text': 'Technically, yes. Advisable? Not really. Reasons:\\nSome homework(s) asks for specific python library versions.\\nAnswers may not match in MCQ options if using different languages other than Python 3.10 (the recommended version for 2023 cohort)\\nAnd as for midterms/capstones, your peer-reviewers may not know these other languages. Do you want to be penalized for others not knowing these other languages?\\nYou can create a separate repo using course’s lessons but written in other languages for your own learnings, but not advisable for submissions.\\ntx[source]', 'section': 'Miscellaneous', 'question': 'Can I do the course in other languages, like R or Scala?', 'course': 'machine-learning-zoomcamp', 'id': '9f261648'}\n", "{'text': 'Yes, it’s allowed (as per Alexey).\\nAdded By Rileen Sinha', 'section': 'Miscellaneous', 'question': 'Is use of libraries like fast.ai or huggingface allowed in the capstone and competition, or are they considered to be \"too much help\"?', 'course': 'machine-learning-zoomcamp', 'id': 'aa7ff0f7'}\n", "{'text': 'The TF and TF Serving versions have to match (as per solution from the slack channel)\\nAdded by Chiedu Elue', 'section': 'Miscellaneous', 'question': 'Flask image was built and tested successfully, but tensorflow serving image was built and unable to test successfully. What could be the problem?', 'course': 'machine-learning-zoomcamp', 'id': '387bdc5f'}\n", "{'text': 'I’ve seen LinkedIn users list DataTalksClub as Experience with titles as:\\nMachine Learning Fellow\\nMachine Learning Student\\nMachine Learning Participant\\nMachine Learning Trainee\\nPlease note it is best advised that you do not list the experience as an official “job” or “internship” experience since DataTalksClub did not hire you, nor financially compensate you.\\nOther ways you can incorporate the experience in the following sections:\\nOrganizations\\nProjects\\nSkills\\nFeatured\\nOriginal posts\\nCertifications\\nCourses\\nBy Annaliese Bronz\\nInteresting question, I put the link of my project into my CV as showcase and make posts to show my progress.\\nBy Ani Mkrtumyan', 'section': 'Miscellaneous', 'question': 'Any advice for adding the Machine Learning Zoomcamp experience to your LinkedIn profile?', 'course': 'machine-learning-zoomcamp', 'id': 'c6a22665'}\n" ] } ], "source": [ "# Filtering for 'machine-learning-zoomcamp' course\n", "filtered_documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']\n", "\n", "# Print the filtered documents\n", "for doc in filtered_documents:\n", " print(doc)" ] }, { "cell_type": "code", "execution_count": 5, "id": "4eb50cc4-891f-473f-9705-3d405ac38495", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "375" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(filtered_documents)" ] }, { "cell_type": "markdown", "id": "96e41f18-fc8c-499c-b364-7fee2cb2d99c", "metadata": {}, "source": [ "\n", "Q2. Creating the embeddings" ] }, { "cell_type": "code", "execution_count": 6, "id": "904b7d36-df77-4d9a-868c-12f2afc2f517", "metadata": {}, "outputs": [], "source": [ "# Create embeddings for each document\n", "embeddings = []\n", "for doc in filtered_documents:\n", " qa_text = f\"{doc['question']} {doc['text']}\"\n", " embedding = model.encode(qa_text)\n", " embeddings.append(embedding)" ] }, { "cell_type": "code", "execution_count": 7, "id": "63eca091-c9fd-404f-8355-8f03b343b8cb", "metadata": {}, "outputs": [], "source": [ "# Convert the list of embeddings to a NumPy array\n", "import numpy as np\n", "X = np.array(embeddings)" ] }, { "cell_type": "code", "execution_count": 8, "id": "5da149ef-3233-4ace-aed0-7c5f7bc19b92", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(375, 768)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape\n", "\n", "#(375, 768)" ] }, { "cell_type": "markdown", "id": "6e088d31-7380-407c-97fe-501cfb54655f", "metadata": {}, "source": [ "Q3. Search" ] }, { "cell_type": "code", "execution_count": 9, "id": "bfab0230-ad52-4666-a87b-29098fa3bece", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 2.89217263e-01 4.35050726e-01 2.20572561e-01 1.28085926e-01\n", " 3.28754455e-01 4.58999664e-01 4.48930591e-01 2.70170599e-01\n", " 2.13975310e-01 3.39745760e-01 1.63339168e-01 3.15918088e-01\n", " 1.53621323e-02 4.14240420e-01 6.50657296e-01 4.12786484e-01\n", " 2.02390462e-01 4.71707582e-01 3.74821007e-01 3.53319108e-01\n", " 4.50558245e-01 1.85430944e-01 3.45526993e-01 8.22948217e-02\n", " -2.19077598e-02 -4.64250743e-02 8.60690773e-02 9.87197682e-02\n", " -2.14188285e-02 1.28751006e-02 2.54870541e-02 3.66866291e-02\n", " 4.56338078e-02 -2.05279887e-02 1.82662532e-02 3.39789167e-02\n", " 1.66510604e-03 1.03209512e-02 3.40170749e-02 -2.65231431e-02\n", " -4.11909968e-02 -4.97983992e-02 -6.20346665e-02 -7.63561390e-03\n", " 4.61616889e-02 6.95944112e-03 4.77613509e-02 -9.97766852e-04\n", " -5.60126789e-02 -5.96206151e-02 2.05209196e-01 -1.16968602e-01\n", " -2.34160442e-02 -1.52456751e-02 -6.83515333e-03 -8.61715078e-02\n", " 2.02636905e-02 -2.51591802e-02 4.21313569e-04 -3.79212126e-02\n", " -3.21442187e-02 2.48612404e-01 2.12661266e-01 2.87320912e-02\n", " 5.64009324e-03 2.83871107e-02 1.02311984e-01 1.08430505e-01\n", " 4.45670784e-02 5.62139153e-02 -1.25657022e-01 6.16888516e-03\n", " 7.16239028e-03 -5.59204817e-02 -4.92570847e-02 -6.11873008e-02\n", " 2.45293360e-02 1.35819921e-02 -4.08511348e-02 -1.22760013e-02\n", " 7.87671432e-02 -4.09776270e-02 2.84435414e-03 3.54050398e-02\n", " 7.06101395e-03 -7.25120008e-02 8.09859335e-02 2.13899747e-01\n", " 5.30421622e-02 5.03777303e-02 -3.79603878e-02 -2.42293589e-02\n", " -1.34442877e-02 7.92892426e-02 -4.85001504e-03 -4.29111272e-02\n", " -3.20610777e-02 6.69904500e-02 -8.40574503e-03 -1.32086679e-01\n", " -1.15126744e-03 4.14728746e-02 -1.31331682e-01 -3.80731598e-02\n", " 6.47669658e-03 8.36595669e-02 2.45577283e-02 -6.55222088e-02\n", " -6.91720098e-02 5.12287356e-02 4.83919308e-03 5.39997071e-02\n", " -2.80694813e-02 -7.57023320e-02 -1.05883174e-01 2.03202926e-02\n", " 1.69166885e-02 -2.74919197e-02 7.05349073e-03 -6.26004562e-02\n", " -1.31930158e-01 5.10980301e-02 1.47758424e-01 -3.40155177e-02\n", " -3.57092805e-02 1.55357812e-02 -1.18725169e-02 7.73690343e-02\n", " 4.35771830e-02 -6.61863387e-02 2.52170742e-01 4.70675789e-02\n", " -7.70067349e-02 3.79015319e-02 1.35742918e-01 -7.73299113e-03\n", " -4.33597751e-02 7.42026642e-02 1.12941302e-02 -4.40411195e-02\n", " 3.28043491e-01 3.83172673e-03 -4.73492593e-03 7.93434680e-04\n", " -6.67457879e-02 1.09346984e-02 6.56736409e-03 -2.20774859e-03\n", " -2.48781964e-03 9.11681913e-03 -1.64973717e-02 6.57348335e-02\n", " 1.86012536e-01 1.81024328e-01 1.19437790e-02 9.31539610e-02\n", " 8.77034366e-02 4.60377932e-02 -3.29313613e-02 8.97290111e-02\n", " 4.92408499e-02 -1.51443370e-02 1.82684168e-01 -9.11686942e-03\n", " 3.82711589e-02 -2.85148025e-02 -2.32421439e-02 -1.17940299e-01\n", " 8.04636329e-02 -3.96181978e-02 -2.93325000e-02 -2.86628045e-02\n", " 5.59822991e-02 2.05439404e-02 4.35311198e-02 8.72714818e-03\n", " 9.05860588e-02 4.16058786e-02 -1.07162185e-01 -1.74158234e-02\n", " 3.76871414e-02 5.74276876e-03 7.82680511e-02 -1.67808235e-02\n", " -2.49322653e-02 1.17247760e-01 1.92652643e-02 2.24364530e-02\n", " 6.70092329e-02 1.04418322e-01 9.30422395e-02 1.11063994e-01\n", " -1.55649493e-02 -8.05713236e-04 1.06892414e-01 -5.77541143e-02\n", " 9.22655687e-02 8.57100189e-02 -5.62388683e-04 6.70872852e-02\n", " -8.66511464e-03 1.68558955e-02 6.78131580e-02 3.23789530e-02\n", " 1.53096408e-01 3.46212648e-03 9.72392708e-02 7.46936211e-03\n", " 5.92091754e-02 -4.15473524e-03 8.42873007e-02 -2.21289136e-02\n", " -2.13755090e-02 -3.08731589e-02 1.13827147e-01 -5.18075898e-02\n", " -3.82553935e-02 1.21785142e-01 2.12656818e-02 1.00383937e-01\n", " 1.02448642e-01 -8.57205689e-02 9.46157947e-02 -5.88218216e-03\n", " -8.54180753e-02 -5.48273176e-02 -2.86803544e-02 6.51872009e-02\n", " -8.90592113e-02 2.25946158e-02 2.30063684e-03 1.26689568e-01\n", " 9.92652178e-02 1.21937200e-01 8.56228080e-03 5.68378121e-02\n", " 1.07429951e-01 3.81353945e-02 3.70678753e-02 5.04433773e-02\n", " 5.89763038e-02 1.46542443e-02 5.39241508e-02 -1.02877337e-03\n", " 4.38618138e-02 -5.49201667e-02 9.17539969e-02 8.76395553e-02\n", " 1.08017325e-02 6.35476485e-02 7.61533678e-02 -1.20977782e-01\n", " 3.12572382e-02 1.07315421e-01 2.11034007e-02 -1.16809994e-01\n", " -7.75170624e-02 -3.13896909e-02 9.74870473e-03 -2.51133107e-02\n", " 7.61508755e-03 -5.23268394e-02 8.75003859e-02 1.02829516e-01\n", " 5.21384589e-02 1.17952842e-02 -6.86213821e-02 -3.29189152e-02\n", " 1.25220809e-02 4.99134064e-02 1.76850818e-02 5.28945029e-02\n", " 3.63487080e-02 -3.74851450e-02 4.61158343e-03 -1.40474707e-01\n", " -8.45709145e-02 -1.63267963e-02 -2.33845972e-03 9.91798490e-02\n", " 8.12100172e-02 -1.15462132e-02 -1.46434102e-02 5.92482202e-02\n", " -2.45577227e-02 2.28130259e-02 1.75959989e-02 4.81145680e-02\n", " 1.09545803e-02 -2.97679640e-02 -4.04076744e-03 8.05019587e-02\n", " 5.23830578e-02 6.93680868e-02 4.00670059e-03 -1.52226947e-02\n", " -5.35538932e-03 2.97432356e-02 2.86108106e-02 5.79058938e-03\n", " 8.26219991e-02 -6.90139038e-03 8.27184319e-02 3.15404758e-02\n", " 7.90223330e-02 1.35424241e-01 -6.60498068e-03 3.40357237e-02\n", " -6.04556128e-02 2.99435109e-04 4.17965464e-02 2.76403073e-02\n", " 2.20877137e-02 8.75887126e-02 3.75620797e-02 1.02920746e-02\n", " 2.79956069e-02 1.11261234e-01 -4.82459925e-02 2.56313700e-02\n", " 1.01505127e-02 1.10233277e-01 8.72281641e-02 1.99888915e-01\n", " 1.58133179e-01 5.12097310e-03 -5.28743267e-02 1.38270974e-01\n", " 3.39298844e-02 3.29888582e-01 1.50600195e-01 9.16852653e-02\n", " 2.34558918e-02 2.25460321e-01 1.53260142e-01 -7.96020962e-03\n", " 1.47858635e-01 5.57383411e-02 4.80688773e-02 7.96168670e-03\n", " 1.04084693e-01 -1.46763816e-01 1.24577671e-01 5.08538447e-03\n", " 9.75179970e-02 -3.69484425e-02 -1.76295117e-02 6.71323538e-02\n", " -4.29844782e-02 8.52768496e-03 3.95143740e-02 4.70580570e-02\n", " -2.83394046e-02 -2.89895721e-02 -4.38462347e-02 -1.16549619e-02\n", " 4.59900014e-02 1.22447908e-02 9.40718427e-02 6.48717359e-02\n", " -5.06505966e-02 1.39609903e-01 1.91962361e-01 1.55291200e-01\n", " -1.94996037e-02 1.88638922e-02 6.77662715e-03 3.05332206e-02\n", " 2.56960243e-02 7.66609609e-02 -7.14630112e-02 4.56990242e-01\n", " 1.33357674e-01 6.59829676e-02 2.52122730e-01]\n" ] } ], "source": [ "# Example query\n", "query = \"I just discovered the course. Can I still join it?\"\n", "\n", "# Compute the embedding for the query\n", "v = model.encode(query)\n", "\n", "# Compute the cosine similarity scores\n", "scores = X.dot(v)\n", "\n", "# Print the scores\n", "print(scores)" ] }, { "cell_type": "code", "execution_count": 10, "id": "cb5742d8-6636-4a96-b54b-69ff999da567", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The highest score is: 0.6506572961807251\n" ] } ], "source": [ "# Find and print the highest score\n", "highest_score = np.max(scores)\n", "print(f'The highest score is: {highest_score}')\n", "\n", "#0.65" ] }, { "cell_type": "code", "execution_count": 11, "id": "c94295b1-e09e-489a-8c52-e1394611c690", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',\n", " 'section': 'General course-related questions',\n", " 'question': 'The course has already started. Can I still join it?',\n", " 'course': 'machine-learning-zoomcamp',\n", " 'id': 'ee58a693'},\n", " {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\\nOr you can just use this link: http://mlzoomcamp.com/#syllabus',\n", " 'section': 'General course-related questions',\n", " 'question': 'I just joined. What should I do next? How can I access course materials?',\n", " 'course': 'machine-learning-zoomcamp',\n", " 'id': '0a278fb2'},\n", " {'text': \"The process is automated now, so you should receive the email eventually. If you haven’t, check your promotions tab in Gmail as well as spam.\\nIf you unsubscribed from our newsletter, you won't get course related updates too.\\nBut don't worry, it’s not a problem. To make sure you don’t miss anything, join the #course-ml-zoomcamp channel in Slack and our telegram channel with announcements. This is enough to follow the course.\",\n", " 'section': 'General course-related questions',\n", " 'question': \"I filled the form, but haven't received a confirmation email. Is it normal?\",\n", " 'course': 'machine-learning-zoomcamp',\n", " 'id': '6ba259b1'},\n", " {'text': 'Technically, yes. Advisable? Not really. Reasons:\\nSome homework(s) asks for specific python library versions.\\nAnswers may not match in MCQ options if using different languages other than Python 3.10 (the recommended version for 2023 cohort)\\nAnd as for midterms/capstones, your peer-reviewers may not know these other languages. Do you want to be penalized for others not knowing these other languages?\\nYou can create a separate repo using course’s lessons but written in other languages for your own learnings, but not advisable for submissions.\\ntx[source]',\n", " 'section': 'Miscellaneous',\n", " 'question': 'Can I do the course in other languages, like R or Scala?',\n", " 'course': 'machine-learning-zoomcamp',\n", " 'id': '9f261648'},\n", " {'text': 'We won’t re-record the course videos. The focus of the course and the skills we want to teach remained the same, and the videos are still up-to-date.\\nIf you haven’t taken part in the previous iteration, you can start watching the videos. It’ll be useful for you and you will learn new things. However, we recommend using Python 3.10 now instead of Python 3.8.',\n", " 'section': 'General course-related questions',\n", " 'question': 'The course videos are from the previous iteration. Will you release new ones or we’ll use the videos from 2021?',\n", " 'course': 'machine-learning-zoomcamp',\n", " 'id': 'e7ba6b8a'}]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "class VectorSearchEngine():\n", " def __init__(self, documents, embeddings):\n", " self.documents = documents\n", " self.embeddings = embeddings\n", "\n", " def search(self, v_query, num_results=10):\n", " scores = self.embeddings.dot(v_query)\n", " idx = np.argsort(-scores)[:num_results]\n", " return [self.documents[i] for i in idx]\n", "\n", "search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)\n", "search_engine.search(v, num_results=5)" ] }, { "cell_type": "markdown", "id": "03d23eb9-1163-4596-b6de-d9cbcf6bdde1", "metadata": {}, "source": [ "Q4. Hit-rate for our search engine" ] }, { "cell_type": "code", "execution_count": 12, "id": "010bd49f-42f9-49d5-801a-da033c12ec7d", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'\n", "relative_url = '03-vector-search/eval/ground-truth-data.csv'\n", "ground_truth_url = f'{base_url}/{relative_url}?raw=1'\n", "\n", "df_ground_truth = pd.read_csv(ground_truth_url)\n", "df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']\n", "ground_truth = df_ground_truth.to_dict(orient='records')" ] }, { "cell_type": "code", "execution_count": 14, "id": "50b76020-9218-43c6-9b3a-b5610c75b94a", "metadata": {}, "outputs": [], "source": [ "# Calculate the hit rate\n", "num_results = 5\n", "hits = 0\n", "\n", "for item in ground_truth:\n", " query = item['question']\n", " ground_truth_id = item['document'] # Using 'document' as the identifier\n", " \n", " # Compute the embedding for the query\n", " v_query = model.encode(query)\n", " \n", " # Get the top N results using the search engine\n", " results = search_engine.search(v_query, num_results=num_results)\n", " \n", " # Check if the ground truth id (document) is in the top N results\n", " result_ids = [res['id'] for res in results]\n", " if ground_truth_id in result_ids:\n", " hits += 1\n", "\n", "# Calculate hit rate\n", "hit_rate = hits / len(ground_truth)\n" ] }, { "cell_type": "code", "execution_count": 15, "id": "130a77c0-b2db-4bd1-86db-14e4c8852322", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9398907103825137" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hit_rate\n", "#0.93" ] }, { "cell_type": "markdown", "id": "bd482b38-8aa0-4037-bf75-aa5994c65734", "metadata": {}, "source": [ "Question 5:" ] }, { "cell_type": "code", "execution_count": 18, "id": "5028b710-0e08-40d3-bb6b-4741dfea4b26", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ObjectApiResponse({'name': 'c06cb69b08f1', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'Ynd8JJwvRV-c1nzcssX1qg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from elasticsearch import Elasticsearch\n", "\n", "es_client = Elasticsearch(\"http://localhost:9200\")\n", "es_client.info()" ] }, { "cell_type": "code", "execution_count": 29, "id": "b75b1b76-19b6-43c6-a24c-80436b9b6f78", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e28c912636dd4ec5968cf8dfa8bb5166", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/375 [00:00