{ "cells": [ { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "# unzip the MovieLens 20M archive (ml-20m.zip) so that movies.csv and ratings.csv sit in the current working directory\n", "movies = pd.read_csv('movies.csv')\n", "ratings = pd.read_csv('ratings.csv')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdratingtimestamptitlegenres
013.51112486027Grumpier Old Men (1995)Comedy|Romance
113.51112484676Shanghai Triad (Yao a yao yao dao waipo qiao) ...Crime|Drama
213.51112484819Wings of Courage (1995)Adventure|Romance|IMAX
313.51112484727Pocahontas (1995)Animation|Children|Drama|Musical|Romance
413.51112484580Guardian Angel (1994)Action|Drama|Thriller
\n", "
" ], "text/plain": [ " userId rating timestamp \\\n", "0 1 3.5 1112486027 \n", "1 1 3.5 1112484676 \n", "2 1 3.5 1112484819 \n", "3 1 3.5 1112484727 \n", "4 1 3.5 1112484580 \n", "\n", " title \\\n", "0 Grumpier Old Men (1995) \n", "1 Shanghai Triad (Yao a yao yao dao waipo qiao) ... \n", "2 Wings of Courage (1995) \n", "3 Pocahontas (1995) \n", "4 Guardian Angel (1994) \n", "\n", " genres \n", "0 Comedy|Romance \n", "1 Crime|Drama \n", "2 Adventure|Romance|IMAX \n", "3 Animation|Children|Drama|Musical|Romance \n", "4 Action|Drama|Thriller " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Attach title/genres to every rating.\n", "# DataFrame.join matches `on` against the *index* of the other frame, so movies\n", "# must be indexed by movieId first; joining the raw frame pairs each rating with\n", "# the movie at that row position instead of the movie with that id.\n", "# NOTE: outputs saved in this notebook predate this fix; re-run to refresh them.\n", "df = ratings.join(movies.set_index('movieId'), on='movieId').drop(columns=['movieId'])\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(27278, 20000263, 733.2012244299435)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(movies), len(ratings), len(ratings) / len(movies)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df.title.to_pickle('title')" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "titles = pd.read_pickle('title').fillna('null')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Grumpier Old Men (1995)\n", "1 Shanghai Triad (Yao a yao yao dao waipo qiao) ...\n", "2 Wings of Courage (1995)\n", "3 Pocahontas (1995)\n", "4 Guardian Angel (1994)\n", "Name: title, dtype: object" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "titles.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "20000263" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "len(titles)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9260" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat_titles = titles.astype(\n", " pd.api.types.CategoricalDtype(\n", " pd.unique(titles)))\n", "len(cat_titles.cat.categories)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "20000263" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(cat_titles.cat.codes)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZUAAAEWCAYAAACufwpNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xm8XVV9///X+06ZZ0IIGQhDAAMCShqgqEXRilQFK2BQ\nS1AqtWCrX+sArVb8VSy0dcIBpKAMohBRJFoiZRCpIIEgCAQICSQhiWROyESGe+/n98daJ/fkcod9\nwz13yH0/H4/zOHuvPX3OPsn53LXW3msrIjAzM+sMVd0dgJmZ7T2cVMzMrNM4qZiZWadxUjEzs07j\npGJmZp3GScXMzDqNk4q1S9JsSTO6Ow6zzibpKklf7O449ibyfSpmey9Jk4BFQG1E1HdvNN1L0rnA\n30bEm7o7lr2Zayq215NU090xWGX5O+45nFR6MUmLJX1W0hOStki6VtKY3Fy1SdLdkkaUrf9eSfMk\nbZB0n6TX5fLPS7q12b6/JemKPH2fpL8tW/ZRSc9IWi/pTkkHtBHjTyWtkPSypPslHVG27Lrc/HBX\njve35fuSFJL+UdILktZI+k9JVWXLW40jb3uhpAXAgrLPtFTSRkmPSnpz2fqXSJop6YYcyzxJU8uW\nT5D0c0mrJa2V9J09PB9vkvRg/g6W5r+ekTQsH3u1pCWSvlD6rDm2H5XtY1L+fDVl38+/SXogx/6/\nkvbJq9+f3zdI2izphBZimibp9zmmlyR9R1Jds3N5gaQFef//Jung/Dk25vNWvv7HJC2UtE7SLEn7\nN9vXx/O+Nkj6riTlZdWSvpa/60WSPlH+OVuIe3H+t/sEsEVSjaSLJD2f43xa0vvyuq8DrgJOyOdh\nQy6/TtJX8vRJkpZJ+idJq/K5+EjZ8UZJ+mX+zI9I+oqk3+VlkvSNvN1GSU9KOrK1fwd7tYjwq5e+\ngMXAQ8AYYBywCvgD8AagP3Av8KW87qHAFuAdQC3wOWAhUAccAGwFhuR1q4GXgOPz/H2kZgOA0/J2\nrwNqgC8AD7YR40eBIUA/4JvA42XLrgM2AW/Jy78F/K5seQC/AUYCE4HnisaRt70rbzsgl30YGJXX\n/ydgBdA/L7sE2Aacmj//vwMPlZ2PPwLfAAblc/umjp6PfJ43AWfn72AUcExedgNwez5Xk/JnPa8s\nth+V7WdS/nw1Zd/P8/k7HpDnL2tp3VbiOhY4Psc/CXgG+FSzc3k7MBQ4AtgO3AMcBAwDngZm5HXf\nBqwB3pi/028D9zfb16+A4fk7XQ2ckpd9PO9rPDACuLut2En/
/h8HJpR9x2cC+5P+YP4A6d/82Lzs\nXMr+fZX9G/xKnj4JqAf+v/z9nEr6fzEiL785vwYCU4Clpf0B7wQezZ9L+d/D2O7+jeiW36XuDsCv\n1/Dlpf9UHyqb/xlwZdn8PwC/yNNfBGaWLasClgMn5fnfAefk6XcAz5etex9NP+azyT92ZfvZChxQ\nIN7h+UdiWJ6/Dri5bPlgoAGYkOej9IOT5y8A7ikSR972be3Esx44Ok9fAtxdtmwK8EqePiH/+L3q\nx60j5wO4GLithfJqYAcwpazs74D7ymJrL6l8odl5+nVL6xb8d/Wp8jjz9ieWzT8KfL5s/mvAN/P0\ntcB/NPtOdwKTyvb1prLlM4GL8vS9wN+VLXt7W7GT/v1/tJ3P8jhwWp4+l/aTyivlxyP9oXZ8/o52\nAoeVLfsKTUnlbaQ/BI4Hqjr6f3lvern5q/dbWTb9Sgvzg/P0/sCS0oKIaCT9pTUuF/2Y9Bc0wAfz\nfEsOAL6Vmy42AOtIf5mNa75ibs64LDdHbCT9CADsU7ba0rKYNuf97d/S8hx/aVmROMq3RdJncjPV\ny3mbYc1iWVE2vRXon5teJgBLouWO7sLnI+/n+RbK9yH9ZbykrGxJK/toTfPYB7e2YnOSDpX0K6Vm\nyo3AV9n9vMCe/zvbDKxl98/SWqz7s/t3ttv314rm3/E5kh4v+z6ObOGztGVts++5FN9oUk2uxfgi\n4l7gO8B3gVWSrpY0tAPH3Ws4qfQdfyL9AAKpDZj0I7c8F/0UOEnSeOB9tJ5UlpL+mhxe9hoQEQ+2\nsO4HSc1Dbyf9gE8qHb5snQllMQ0mNVf9qaXlpOaS0rIicey6tFGp/+RzwFmk5ozhwMvNYmnNUmBi\nK237HTkfS4GDWyhfQ/oruLwvZiJN380WUpNLyX4FYi4pcnnnlcCzwOSIGAr8M8XOS0ua/zsbRGrm\nW97qFk1eIjV9lUxobcUy5d/xAcB/A58ARuXv+CmaPstrudR1NalprNX4IuKKiDiWVMs9FPjsazhe\nr+Wk0nfMBP5K0smSakl9CtuBBwEiYjWpGeWHwKKIeKaV/VwFXKzc4Z47mM9sZd0h+RhrST+KX21h\nnVOVOq/rgH8j9WOU/zX4WUkjJE0APgncsgdxlGKpJzdjSfpXUh9BEQ+TfvAukzRIUn9JJ+5BHDcB\nb5d0Vu5UHiXpmIhoIH0/l0oakn8cPw2UOucfB94iaaKkYaRmtKJWA42k/o/WDAE2ApslHQ78fQf2\n39xPgI9IOkZSP9J3PiciFhfYdibwSUnjJA0HPt/BYw8iJY7VALmTvbyzfCUwvvyigqLyd/Rz4BJJ\nA/N5Oqe0XNKfSTou/9/aQuqfa+zocfYGTip9RETMJ3VUf5v0l/F7gPdExI6y1X5MqlW0VkshIm4D\nLgduzk0lTwHvamX1G0hNIctJHbAPtbDOj4EvkZqNjs0xlrud1Ib/OPA/pDb7jsYBcCfwa1K79xLS\nf/oizSulH5T3AIcALwLLSJ3AHYojIl4kdf7+U/68jwNH58X/QPoxeoHUv/Vj4Ad5u7tIyfSJfC5+\nVSTuvO1W4FLggdwkdHwLq32GVKvcRPpL/5YW1il6vLtJ/Xc/IyXig4HpBTf/b+B/SZ/zMeAO0h8C\nDQWP/TSpf+f3pATyeuCBslXuBeYBKyStKRhTuU+QatwrgBtJCXR7XjY0x7+e9O9rLfCfe3CMXs83\nP1q3kXQdsCwivtDK8iA1ySzs0sCsR5D0LuCqiGj1Eu3uJOlyYL+I8GgTZVxTMbMeQdIASafmpsFx\npBrsbd0dV4mkwyUdle9JmQacRw+Kr6dwUjGznkLAl0lNSI+R7pf5126NaHdDSP0qW0hNhF8jNc9a\nGTd/mZlZp6lYTUXSYfl68dJro6RPSRqpNCzHgvxePozIxUrDO8yX9M6y8mPzsAcLJV2RL4dFUj9J\nt+TyOUqD55mZWTfpkpqK
pGrSFUDHARcC6yLiMkkXke4Z+LykKaSrKaaRboK6Gzg0IhokPQz8IzCH\ndEXIFRExW9IFwFER8XFJ04H3RcQH2opln332iUmTJlXok5qZ7Z0effTRNRExur31umpkz5NJw34s\nkXQaaTgEgOtJ90Z8nnST3M0RsR1YJGkhME3SYmBoRDwEIOkG4HTS8BinkYawALgV+I4kRRuZctKk\nScydO7dzP52Z2V5O0pL21+q6jvrppFoIwJiIeClPryANhghpGIfy+waW5bJxebp5+W7b5KEVXibd\nvbsbSedLmitp7urVq1/7pzEzsxZVPKnku1ffSxoGZDe5RlHx9reIuDoipkbE1NGj2629mZnZHuqK\nmsq7gD9ERGkAupWSxgLk91W5fDm7j6UzPpctZ/fxdkrlu22Tx2UaRrqT1czMukFXJJWzaWr6ApgF\nlO5AnUHTdd6zgOn5iq4DgcnAw7mpbKOk4/NVX+c026a0rzOAe9vqTzEzs8qqaEd9HqH0HaRnQ5Rc\nBsyUdB5pjJyzACJinqSZpDGi6oEL85hLkJ4PcR3pAUSz8wvSOFA35k79dRQfY8jMzCqgz938OHXq\n1PDVX2ZmHSPp0YiY2t56HqbFzMw6jZNKQY8sXsfX/3c+O+r75CMSzMwKcVIp6A9L1nPFvQupb3RS\nMTNrjZNKQcoPJG3sW11QZmYd4qRSUFXOKn3twgYzs45wUukg11TMzFrnpFJQqaZS+UFlzMx6LyeV\ngpr6VJxVzMxa46RS0K4+lW6Ow8ysJ3NSKcg1FTOz9jmpFKRdV391cyBmZj2Yk0pBuaLiS4rNzNrg\npFKQ+1TMzNrnpFKQ+1TMzNrnpFJQVek2FecUM7NWOakUVOqod03FzKx1TioFNXXUd2sYZmY9mpNK\nQVW+pNjMrF1OKgW5o97MrH1OKgX5kmIzs/Y5qRTkmoqZWfsqmlQkDZd0q6RnJT0j6QRJIyXdJWlB\nfh9Rtv7FkhZKmi/pnWXlx0p6Mi+7QvlSLEn9JN2Sy+dImlTBzwK4T8XMrC2Vrql8C/h1RBwOHA08\nA1wE3BMRk4F78jySpgDTgSOAU4DvSarO+7kS+BgwOb9OyeXnAesj4hDgG8DllfogHqbFzKx9FUsq\nkoYBbwGuBYiIHRGxATgNuD6vdj1wep4+Dbg5IrZHxCJgITBN0lhgaEQ8FOkX/YZm25T2dStwcqkW\n09ncp2Jm1r5K1lQOBFYDP5T0mKRrJA0CxkTES3mdFcCYPD0OWFq2/bJcNi5PNy/fbZuIqAdeBkY1\nD0TS+ZLmSpq7evXqPfow7lMxM2tfJZNKDfBG4MqIeAOwhdzUVZJrHhX/lY6IqyNiakRMHT169B7t\nw8O0mJm1r5JJZRmwLCLm5PlbSUlmZW7SIr+vysuXAxPKth+fy5bn6eblu20jqQYYBqzt9E8ClHpV\nXFMxM2tdxZJKRKwAlko6LBedDDwNzAJm5LIZwO15ehYwPV/RdSCpQ/7h3FS2UdLxub/knGbblPZ1\nBnBvVKgn3TUVM7P21VR4//8A3CSpDngB+Agpkc2UdB6wBDgLICLmSZpJSjz1wIUR0ZD3cwFwHTAA\nmJ1fkC4CuFHSQmAd6eqxivAlxWZm7atoUomIx4GpLSw6uZX1LwUubaF8LnBkC+XbgDNfY5iF7Kqp\n+PovM7NW+Y76gpqu/ureOMzMejInlYKamr+cVczMWuOkUlDpjkrXVMzMWuekUlDTjfrOKmZmrXFS\nKajKfSpmZu3qUFKRVCVpaKWC6cmELyk2M2tPu0lF0o8lDc3jdj0FPC3ps5UPrWdpuvnRWcXMrDVF\naipTImIjaWTg2aSBIv+molH1RG7+MjNrV5GkUiuplpRUZkXETvpgb3XT0Pd97qObmRVWJKlcBSwG\nBgH3SzoA2FjJoHqipod0dWsYZmY9WpvDtEiqAlZGxLiysheBt1Y6sJ6mqsod9WZm7Wmzph
IRjcDn\nmpVFfiBWn9J086OziplZa4o0f90t6TOSJkgaWXpVPLIeRn6csJlZu4qMUvyB/H5hWVkAB3V+OD2X\nHydsZta+dpNKRBzYFYH0dKWrv1xVMTNrXaHnqUg6EpgC9C+VRcQNlQqqJ3KfiplZ+9pNKpK+BJxE\nSip3AO8Cfgf0qaRS5Sc/mpm1q0hH/RmkJzWuiIiPAEcDwyoaVQ/kPhUzs/YVSSqv5EuL6/NgkquA\nCZUNq+dxl4qZWfuK9KnMlTQc+G/gUWAz8PuKRtUDNY1S7LRiZtaadmsqEXFBRGyIiKuAdwAzcjNY\nuyQtlvSkpMclzc1lIyXdJWlBfh9Rtv7FkhZKmi/pnWXlx+b9LJR0hfJNI5L6Sboll8+RNKljH7+4\nqnymnFPMzFpXZOh7SfqwpH+NiMXABknTOnCMt0bEMRExNc9fBNwTEZOBe/I8kqYA04EjgFOA70mq\nzttcCXwMmJxfp+Ty84D1EXEI8A3g8g7E1SGlmopHKTYza12RPpXvAScAZ+f5TcB3X8MxTwOuz9PX\nk0Y/LpXfHBHbI2IRsBCYJmksMDQiHorU9nRDs21K+7oVOLlUi+lsVe6oNzNrV5GkclxEXAhsA4iI\n9UBdwf0HaZiXRyWdn8vGRMRLeXoFMCZPjwOWlm27LJeNy9PNy3fbJo9H9jIwqnkQks6XNFfS3NWr\nVxcM/VX7AJxUzMzaUqSjfmduhgoASaOBxoL7f1NELJe0L3CXpGfLF0ZESKr4r3REXA1cDTB16tQ9\nOl5Nrqo0uP3LzKxVRWoqVwC3AWMkXUq68fGrRXYeEcvz+6q8j2nAytykRX5flVdfzu6XKo/PZcvz\ndPPy3baRVEO6f2Ztkdg6qtpJxcysXUWu/rqJNPz9V4E/AadHxE/b207SIElDStPAX5KecT8LmJFX\nmwHcnqdnAdPzFV0HkjrkH85NZRslHZ/7S85ptk1pX2cA90aFrvmtqXZSMTNrT6Gxv4CBQKkJbEDB\nbcYAt+W+iBrgxxHxa0mPADMlnQcsAc4CiIh5kmYCTwP1wIUR0ZD3dQFwXT727PwCuBa4UdJCYB3p\n6rGKqM59KvVOKmZmrSoy9te/AmcCPyONq/hDST+NiK+0tV1EvEAa0qV5+VrSsC8tbXMpcGkL5XOB\nI1so35Zjqzg3f5mZta9ITeVDwNH5BxxJlwGPA20mlb1NTb770TUVM7PWFemo/xNlQ94D/WjqKO8z\nSn0q9Q1FL3wzM+t7itRUXgbmSbqL1KfyDuBhSVcARMQ/VjC+HqO2OuXfnU4qZmatKpJUbsuvkvsq\nE0rPVptrKjsb3PxlZtaaIo8Tvl5SHXA4qaYyPyJ2VDyyHkYStdVyTcXMrA1Frv46Ffg+8Dzp6q8D\nJf1dRMxue8u9T211lZOKmVkbijR/fZ000vBCAEkHA/9D070ifUZNldz8ZWbWhiJXf20qJZTsBdJI\nxX1OXY1rKmZmbSn65Mc7gJmkPpUzgUck/TVARPy8gvH1KG7+MjNrW5Gk0h9YCfxFnl9NGi7lPaQk\n08eSipu/zMxaU+Tqr0KPDu4Lanz1l5lZm4pc/dWf9NjeIyi7sz4iPlrBuHqkOjd/mZm1qUhH/Y3A\nfsA7gd+SnmfSJzvq3fxlZta2IknlkIj4IrAlIq4H/go4rrJh9Uy11eKVHQ3tr2hm1kcVSSo78/sG\nSUeSnq64b+VC6rn2HdKf1Zu3d3cYZmY9VpGrv66WNAL4AulJi4OBL1Y0qh6qrqbKz1MxM2tDkau/\nrsmT9wMHVTacnq2mStQ3uqPezKw1RZq/LKuuEg3uqDcza5WTSgfUVIudbv4yM2uVk0oH1FS5T8XM\nrC3tJhVJAyV9UdJ/5/nJkt5d9ACSqiU9JulXeX6kpLskLcjvI8rWvVjSQknzJb2zrPxYSU/mZVdI\nUi7vJ+mWXD5H0qTiH73jqqvkxwmbmbWhSE3lh8B24I
Q8vxz4SgeO8UngmbL5i4B7ImIycE+eR9IU\nYDrpzv1TgO9Jqs7bXAl8DJicX6fk8vOA9RFxCPAN4PIOxNVhNVVyTcXMrA1FksrBEfEf5PtVImIr\n6WFd7ZI0nnSz5DVlxacB1+fp64HTy8pvjojtEbEIWAhMkzQWGBoRD0VEADc026a0r1uBk0u1mEqo\nrhb1TipmZq0qklR2SBpAGpG49JCuoncAfhP4HFDeZjQmIl7K0yuAMXl6HLC0bL1luWxcnm5evts2\nEVEPvAyMah6EpPMlzZU0d/Xq1QVDfzXXVMzM2lYkqXwJ+DUwQdJNpCarz7W3Ue53WRURj7a2Tq55\nVPxXOiKujoipETF19OjRe7yfmqoq6huDFLaZmTVX5ObHuyT9ATie1Oz1yYhYU2DfJwLvzc+47w8M\nlfQjYKWksRHxUm7aWpXXXw5MKNt+fC5bnqebl5dvs0xSDWkImbUFYtsjNVWpZa2hMaiprlgrm5lZ\nr9VqTUXS4fn9jcABwEvAn4CJuaxNEXFxRIyPiEmkDvh7I+LDpKFeZuTVZgC35+lZwPR8RdeBpA75\nh3NT2UZJx+f+knOabVPa1xn5GBWrRvSvTdcNbN3pQSXNzFrSVk3l08D5wNdaWBbA2/bwmJcBMyWd\nBywBzgKIiHmSZgJPA/XAhRFR+vW+ALiO9MTJ2fkFcC1wo6SFwDpS8qqYfrUpB++o92XFZmYtaTWp\nRMT5+f2tr/UgEXEfcF+eXguc3Mp6lwKXtlA+FziyhfJtwJmvNb6iaqpSUqn3UC1mZi0qcvPjE/mm\nxIO7IqCerNSP4qc/mpm1rMjVX+8BGkhNVo9I+oykiRWOq0eqzUnF96qYmbWs3aQSEUsi4j8i4ljg\ng8BRwKKKR9YDNTV/uaZiZtaSIg/pQtIBwAfyq4EC96nsjWp3NX+5pmJm1pJ2k4qkOUAt8FPgzIh4\noeJR9VC11bmm4gd1mZm1qEhN5ZyImF/xSHqBmpxU3FFvZtayIh31KyR9vTR2lqSvSRpW8ch6oNoq\nN3+ZmbWlSFL5AbCJdJPiWcBG0nD4fU6ppuL7VMzMWlak+evgiHh/2fyXJT1eqYB6sl33qbhPxcys\nRUVqKq9IelNpRtKJwCuVC6nnqvUd9WZmbSpSU/l74PrcjyLSGFvnVjKonqpUU/F9KmZmLSsy9P3j\nwNGShub5jRWPqofadZ+K76g3M2tRkftUhpOGm58E1JSe1hsR/1jRyHqgof1rAVi3ueiDL83M+pYi\nzV93AA8BT7L7Y4H7nOED6wDYvL2+myMxM+uZiiSV/hHx6YpH0gvU1VRRV13FJicVM7MWFbn660ZJ\nH5M0VtLI0qvikfVQg/pVs8VJxcysRUVqKjuA/wT+hfTER/L7QZUKqicb0r+WTducVMzMWlIkqfwT\ncEhErKl0ML3B8IG1bNi6s7vDMDPrkYo0fy0EtlY6kN5i2IBaNrzipGJm1pIiNZUtwOOSfgPsupa2\nL15SDDBiYB0vrnOONTNrSZGk8ov8Mtz8ZWbWliJ31F+/JzuW1B+4H+iXj3NrRHwpXzl2C+lmysXA\nWRGxPm9zMXAe6emS/xgRd+byY4HrgAGk+2Y+GREhqR9wA3AssBb4QEQs3pN4ixo+oJaN23bS0BhU\n56HwzcwsKdKnsqe2A2+LiKOBY4BTJB0PXATcExGTgXvyPJKmANOBI4BTgO9Jqs77uhL4GDA5v07J\n5ecB6yPiEOAbwOUV/DxAugEyAja6X8XM7FUqllQi2Zxna/MrgNOAUu3neuD0PH0acHNEbI+IRaQL\nBKZJGgsMjYiHIiJINZPybUr7uhU4WaVxZCpkcL9UufNd9WZmr9ahpCKpqjSwZMH1q/OzV1YBd0XE\nHGBMRLyUV1kBjMnT44ClZZsvy2Xj8nTz8t22iYh64GVgVAtxnF96cuXq1auLht+iQTmpbN3R8Jr2\nY2a2N2o3qUj6sa
ShkgYBTwFPS/pskZ1HRENEHAOMJ9U6jmy2PGi6obJiIuLqiJgaEVNHjx79mvY1\nsF9qkduywzUVM7PmitRUpuTh7k8HZgMHAn/TkYNExAbgN6S+kJW5SYv8viqvthyYULbZ+Fy2PE83\nL99tG0k1wDBSh33FlJq/PFSLmdmrFUkqtZJqSUllVkTspEDtQtLoPGw+kgYA7wCeBWYBM/JqM4Db\n8/QsYLqkfpIOJHXIP5ybyjZKOj73l5zTbJvSvs4A7s21n4opDX+/8RUnFTOz5orcp/J90qW/fwTu\nl3QAUORBXWNJT4ysJiWvmRHxK0m/B2ZKOg9YApwFEBHzJM0EngbqgQsjotRxcQFNlxTPzi+Aa0kD\nXi4kPZFyeoG4XpPhA1NS2fDKjkofysys1ylyn8oVwBVlRUskvbXAdk8Ab2ihfC1wcivbXApc2kL5\nXODIFsq3AWe2F0tnGjYgJZWXfUmxmdmrFOmoHyPpWkmz8/wUmpqc+pz+tdX0q6niZd9Vb2b2KkX6\nVK4D7gT2z/PPAZ+qVEC9gYdqMTNrWZGksk9EzCQ/SjjfD9Knb9IYNqDWzV9mZi0oklS2SBpFvuIr\nD7XyckWj6uGGD6hzR72ZWQuKXP31adKluwdLegAYTbp8t88aNrCWRWu2dHcYZmY9TpGrv/4g6S+A\nwwAB8/O9Kn3WyIF1PLZ1Q3eHYWbW47SbVPJ9JqeShqqvAf5SEhHx9QrH1mMNHVDjO+rNzFpQpPnr\nl8A24ElyZ31fN6hfDa/sbPAzVczMmimSVMZHxFEVj6QX2TX8/bZ6huU77M3MrNjVX7Ml/WXFI+lF\nxg4bAOBn1ZuZNVMkqTwE3CbpFUkbJW2SVGTsr73WIfsOBuCFNZvbWdPMrG8p0vz1deAE4MlKjwDc\nWxwwaiAAS11TMTPbTZGaylLgKSeUJv1rqxkztB9PLOvT94Camb1KkZrKC8B9eUDJ7aXCvnxJMcAJ\nB43igecr+jwwM7Nep0hNZRFwD1AHDCl79WkTRw1i9abt1Df4Kmszs5Iid9R/uSsC6W32GVwHwPqt\nOxk9pF83R2Nm1jO0mlQkfTMiPiXpl7Tw+OCIeG9FI+vhhg8sJZUdTipmZllbNZUb8/t/dUUgvc2E\nEelelcVrtnDomD7fGmhmBrTRpxIRj+bJYyLit+Uv4JiuCa/nmpwTyYJVvlfFzKykSEd9S48OPreT\n4+h1BverYdzwAcxfsam7QzEz6zFaTSqSzs79KQdKmlX2+g2wrr0dS5og6TeSnpY0T9Inc/lISXdJ\nWpDfR5Rtc7GkhZLmS3pnWfmxkp7My66QpFzeT9ItuXyOpEl7fio67tAxg3lupZOKmVlJW30qDwIv\nAfsAXysr3wQ8UWDf9cA/5eexDAEelXQXqZZzT0RcJuki4CLg85KmANOBI4D9gbslHRoRDcCVwMeA\nOcAdwCnAbOA8YH1EHCJpOnA58IFiH/21O3TMEB5YuJb6hkZqqotU+szM9m6tJpWIWAIsIQ3R0mER\n8RIpKRERmyQ9A4wDTgNOyqtdD9wHfD6X3xwR24FFkhYC0yQtBoZGxEMAkm4ATiclldOAS/K+bgW+\nI0lddff/5DFD2NHQyJJ1Wzl49OCuOKSZWY/WJX9e52apN5BqGmNywgFYAYzJ0+NIQ8KULMtl4/J0\n8/LdtomIeuBlYFQLxz9f0lxJc1evXt0Jnyg5dExKJAvcBGZmBnRBUpE0GPgZ8KmI2G1041yjqHit\nIiKujoipETF19OjRnbbf0mjFz630FWBmZlDhpCKplpRQboqIn+filZLG5uVjgVW5fDkwoWzz8bls\neZ5uXr7bNpJqgGFAlw3INbCuhgkjB7iz3swsazepSHq4bPrMojvOV2hdCzzTbPDJWTRdpjwDuL2s\nfHq+outAYDLwcG4q2yjp+LzPc5ptU9rXGcC9XT2a8iGjB/P86i1deUgzsx6rrWFa
HiQ9l35fSYcD\nC4CLgZ+32sdSAAAVwUlEQVQW3PeJwN8AT0p6PJf9M3AZMFPSeaQLAc4CiIh5kmYCT5OuHLswX/kF\ncAFwHTCA1EE/O5dfC9yYO/XXka4e61IHjR7M719YS2NjUOXn1ZtZH9fWJcUnAq8HTgU+R6o5HCTp\nMuC3ETG7jW2JiN8Brf3KntzKNpcCl7ZQPhc4soXybUDh2lMlHLjPILbtbGTlpm27HjNsZtZXtdX8\n9QPgWGBjRHw0It5MqlnMzuUGTByZngL57EvuVzEza6um8u/Am4H9JD1AekDXGGAk8P0uiK1XmDRq\nEAAv+tHCZmZtDij5XERcC7wYEScC7ybdB3IwcE0XxdfjTRg5gCH9anh2xcb2VzYz28sVuaT4HwAi\nYivwbET8V0ScVtmweg9JHDluGM+4+cvMrP2kkjvcS9NOJi2YOHIgi9duoYuvZjYz63E8CmInOHL8\nMDZs3cmSte5XMbO+zUmlE5xw0EgA7nl2VTtrmpnt3ZxUOsEh+w7h8P2GMPORpW4CM7M+ra1LigGQ\n9NctFL8MPBkR/tM8++BxE/nX2+cxf+UmDt9vaHeHY2bWLYrUVM4jXUL8ofz6b9LzTx6Q9DcVjK1X\neceUNIL//c913tD6Zma9TZGkUgO8LiLeHxHvB6aQhqs/jpRcDBg7bACHjhnMnfNWdncoZmbdpkhS\nmRAR5b+Uq3LZOmBnZcLqnc6eNpFHl6znwYVrujsUM7NuUSSp3CfpV5JmSJpBGm7+PkmDgA2VDa93\n+eBxExlQW80dT73U/spmZnuhIknlQuCHwDH5dT1pWPotEfHWSgbX2/Srqebk1+3LrY8uY8v2+u4O\nx8ysyxW5oz6A3wH3AvcA93f1g7B6k/cfO55tOxvdYW9mfVKRJz+eBTxMerLiWcAcSWdUOrDe6i2T\nRzNu+AB++MDi7g7FzKzLFWn++hfgzyJiRkScA0wDvljZsHqv6irxkRMn8fDidTzgDnsz62OKJJWq\nZjc5ri24XZ/1oeMOYMTAWq64Z0F3h2Jm1qWKJIdfS7pT0rmSzgX+B7ijsmH1bgPqqvnE2yYzZ9E6\n5i5e193hmJl1mSId9Z8FrgaOyq+rI8I3Pbbj7GkTGDmojst//Wx3h2Jm1mUKNWNFxM8i4tP5dVuR\nbST9QNIqSU+VlY2UdJekBfl9RNmyiyUtlDRf0jvLyo+V9GRedoUk5fJ+km7J5XMkTSr6obvCwLoa\nzp42gUcWr2fBSj/Ay8z6hlaTiqRNkja28Nokqcizc68DTmlWdhFwT0RMJl2efFE+1hRgOnBE3uZ7\nkqrzNlcCHwMm51dpn+cB6yPiEOAbwOUFYupSHznxQKqrxI8eWtLdoZiZdYm2nlE/JCKGtvAaEhHt\nDsMbEfcDzTsUTiPdPEl+P72s/OaI2B4Ri4CFwDRJY4GhEfFQvjfmhmbblPZ1K3ByqRbTU+wzuB+n\nHzOOm+a86NqKmfUJXX0V15iIKI1hsgIYk6fHAUvL1luWy8bl6eblu20TEfWk4fhHVSbsPfe5Uw5j\nQG01X73jme4Oxcys4rrt0uBc8+iSO/MlnS9prqS5q1d37Z3uY4b25xNvO4TfzF/N759f26XHNjPr\nal2dVFbmJi3ye+n+l+XAhLL1xuey5Xm6eflu20iqAYaR7qF5lYi4OiKmRsTU0aNHd9JHKW7Gn09i\n7LD+fPWOZ9jZ0Njlxzcz6ypdnVRmATPy9Azg9rLy6fmKrgNJHfIP56ayjZKOz/0l5zTbprSvM4B7\ne+qYZP1rq7noXYfz5PKX+fIv53V3OGZmFdPu44T3lKSfACcB+0haBnwJuAyYKek8YAlpLDEiYp6k\nmcDTQD1pFOSGvKsLSFeSDQBm5xfAtcCNkhaSLgiYXqnP0hlOO2YcT/9pI9+//wWqJC55zxFUVfWo\n6wrMzF4z9dA/7itm6tSpMXfu3G45dn1DI1+a
NY+b5rzIu47cj6+ddTQD6yqW183MOo2kRyNianvr\neQyvLlRTXcVXTj+SL/zV67hz3gpm/OBhNm7zwzPNbO/hpNLFJPG3bz6IK85+A4+9uIGzr36I1Zu2\nd3dYZmadwkmlm7z7qP25ZsZUnl+9mTOvepBl67d2d0hmZq+Zk0o3Oumwfbnpb49nzeYdfPS6R3jp\n5Ve6OyQzs9fESaWbHXvACL77oTeydN0rvO+7D7J0nWssZtZ7Oan0AH9x6Ghu+bvjWb91B2d9//c8\n4mewmFkv5aTSQxw1fjg/Of94qqvE2Vc/xI/nvEhfu9zbzHo/J5Ue5I0TR3DHJ9/MiYfswz/f9iTv\nv/JBfjN/Vfsbmpn1EE4qPczQ/rVcM2Mql7xnCqs2becjP3yED18zh98/v9Y1FzPr8XxHfQ+2o76R\n6x9czPfvf4E1m7dz9ITh/P1fHMxfThnjIV7MrEsVvaPeSaUX2LazgZ/9YRnf/+0LvLhuKweNHsTf\nveUg3veG8dTVuLJpZpXnpNKK3phUSuobGpn91Aqu+u3zzPvTRiaMHMCHjzuA9x6zP2OHDeju8Mxs\nL+ak0orenFRKIoJ7n13FVb99nkcWrwfgjROHc+rrx3Lq68ey/3AnGDPrXE4qrdgbkkq5F1ZvZvZT\nK/ifJ17i6Zc2AnD0+GGc/LoxvO3wfTli/6GkR9GYme05J5VW7G1JpdyiNVu448mXuPuZlTy+dAMR\nsN/Q/rx9yr68efJo/mzSSEYOquvuMM2sF3JSacXenFTKrdm8nfvmr+bup1dy33Or2LYzPcZ48r6D\nmXbgSN44cQRHjR/GIfsOdk3GzNrlpNKKvpJUym2vb+DJZS8zZ9E6Hlm8jrmL17N5ez0AwwfWcsyE\n4Ryx/1Am7zuEyWMGc/DowfSvre7mqM2sJymaVPzYwT6gX001UyeNZOqkkUC6imzx2i08umQ9j724\ngcde3MDvFqyhvjH9gVElOGDUICbvO5hDx6REc+iYIRw0ehD9apxszKx1Tip9UE11FYfsO4RD9h3C\nB/5sIpButFy0ZgvPrdzEgpWbWLBqM8+t3MQ9z66iISeb6ioxYcQAJo4axMSRAxg/YiCjB/dj5OA6\n9hnUj1GD6xg5qM61HLM+zEnFAKirqeKw/YZw2H5DdivfXt+Qk81mFqzcxAurt/Diuq08/uJ6Nm6r\nb3Ffg/vV7Eowowb1Y9SgOkYNrmPU4LLpsiRUW+0bOM32Fk4q1qZ+NdUcvt9QDt9v6KuWbd5ez5pN\n21m7ZTtrN+9g7ZYdrNuygzWb0/y6LTtYtn4rf1y2gXVbduyq8TQ3bEDtbslm5OA6RgysZVC/Gob0\nr2XYgPQant8H9athQF01A2qrqfZwNWY9Sq9PKpJOAb4FVAPXRMRl3RxSnzG4Xw2D+9UwaZ9B7a7b\n2Bhs3LaTtVt2pAS0efuu6XVbtrNmSyp7Yc1mHlm8gw2v7Gw1CZUbWFe9K45SohlQV02/mmr61VbR\nr6YqTddU0b+2tLwqv9fQv7aK/nnd2uoq6mqqqGv2XludpmtrRE1VFbXV8hVzZq3o1UlFUjXwXeAd\nwDLgEUmzIuLp7o3MmquqEsMH1jF8YB0Hj25//Yhg285GNm7bycZXdvJy2WvLjgZe2VHP1h0NbNle\nz+bt9WzaVs+2nQ1s3dGQalCbd7B9ZwPb6xvZ0dDIttJ0fWOnfJ6aKlFTLWqrqqitqaKmStRWV1Gd\ny2uqRHVV1a710nzplcvzsuqqKmrzsprqql3rViktr5KoroJqpXWrq9L5TPMpwVWLpukqUSXydtrt\nvUq8ah3l9/LlVUrHaJpvWh/y+lUgdt9Gzdcr7YN0HAE0my/FoLRgV7mk/N60Ps3mX7Wek32369VJ\nBZgGLIyIFwAk3QycBjip9HKSUs2jrpoxQ/t32n4bGoNXdjbwyo702lbfsCvh7KxvZHtDet/RkBLQ\nzoZGdjQE
O/N0egU7Gxqpb8zveX5nQ1Df2EhDY9DQGOxsCBoa03oNjbFrvYbGBuob03a7ljU2li1P\nZY0B9Y2NNDZCQ0ShmpsluxIYTUmnqbyUnZrediWsXNja9rtv0/K6zfdJi/vc/VjN97lbeVmebGmb\n5sfdfWL3dT558mTec/T+VFJvTyrjgKVl88uA45qvJOl84HyAiRMndk1k1iNVV2lXc1lv1JRwmt5L\nSSdNp2TUsGs6Ja7GxiBg13YRaboxv0dpOm9fmi/tl1et37ROUL5Nad/s2q6htD6ldyBSPFG+P9j1\nzKDm65fPk9draVn5vsv30zTdVA552a7p3fddKttt3bL5Xeu2sM941T6b5tK5e/U+W4qZ8mOVHfPV\n+3718t02zxPDBtRSab3zf1YHRcTVwNWQbn7s5nDM9lipCc2sp+rt13IuByaUzY/PZWZm1g16e1J5\nBJgs6UBJdcB0YFY3x2Rm1mf16uaviKiX9AngTtIlxT+IiHndHJaZWZ/Vq5MKQETcAdzR3XGYmVnv\nb/4yM7MexEnFzMw6jZOKmZl1GicVMzPrNH3uyY+SVgNL9nDzfYA1nRhOb+XzkPg8JD4PfeMcHBAR\n7Y7c1+eSymshaW6Rx2nu7XweEp+HxOfB56Ccm7/MzKzTOKmYmVmncVLpmKu7O4Aewuch8XlIfB58\nDnZxn4qZmXUa11TMzKzTOKmYmVmncVIpSNIpkuZLWijpou6OpzNJmiDpN5KeljRP0idz+UhJd0la\nkN9HlG1zcT4X8yW9s6z8WElP5mVXqJc9NFxStaTHJP0qz/e5cwAgabikWyU9K+kZSSf0tXMh6f/l\n/w9PSfqJpP597RzskciPEvWr9RdpWP3ngYOAOuCPwJTujqsTP99Y4I15egjwHDAF+A/golx+EXB5\nnp6Sz0E/4MB8bqrzsoeB40mPxp4NvKu7P18Hz8WngR8Dv8rzfe4c5M9wPfC3eboOGN6XzgXpUeWL\ngAF5fiZwbl86B3v6ck2lmGnAwoh4ISJ2ADcDp3VzTJ0mIl6KiD/k6U3AM6T/VKeRflzI76fn6dOA\nmyNie0QsAhYC0ySNBYZGxEOR/jfdULZNjydpPPBXwDVlxX3qHABIGga8BbgWICJ2RMQG+t65qAEG\nSKoBBgJ/ou+dgw5zUilmHLC0bH5ZLtvrSJoEvAGYA4yJiJfyohXAmDzd2vkYl6ebl/cW3wQ+BzSW\nlfW1cwDpL+3VwA9zU+A1kgbRh85FRCwH/gt4EXgJeDki/pc+dA72lJOK7SJpMPAz4FMRsbF8Wf4r\na6+9/lzSu4FVEfFoa+vs7eegTA3wRuDKiHgDsIXU1LPL3n4ucl/JaaQEuz8wSNKHy9fZ28/BnnJS\nKWY5MKFsfnwu22tIqiUllJsi4ue5eGWuvpPfV+Xy1s7H8jzdvLw3OBF4r6TFpObNt0n6EX3rHJQs\nA5ZFxJw8fyspyfSlc/F2YFFErI6IncDPgT+nb52DPeKkUswjwGRJB0qqA6YDs7o5pk6Tr0a5Fngm\nIr5etmgWMCNPzwBuLyufLqmfpAOBycDDuVlgo6Tj8z7PKdumR4uIiyNifERMIn2/90bEh+lD56Ak\nIlYASyUdlotOBp6mb52LF4HjJQ3MsZ9M6mvsS+dgz3T3lQK95QWcSroq6nngX7o7nk7+bG8iVeOf\nAB7Pr1OBUcA9wALgbmBk2Tb/ks/FfMquZgGmAk/lZd8hj9rQm17ASTRd/dVXz8ExwNz8b+IXwIi+\ndi6ALwPP5vhvJF3Z1afOwZ68PEyLmZl1Gjd/mZlZp3FSMTOzTuOkYmZmncZJxczMOo2TipmZdRon\nFdtrSfq4pHM6sP571cERqCVdJ+mMjkfXtfvsTJLOlfSdDm4zVdIVefokSX9emeisu9V0dwBmlRIR\nV3Vw/VnsRTe19hSSaiJiLum+F0j3AW0GHuy2oKxiXFOxbidpUn5ux3WSnp
N0k6S3S3ogP7diWl5v\npKRfSHpC0kOSjpJUJWmxpOFl+1sgaYykSyR9JpcdLOnXkh6V9H+SDm8hjl1/gedYrpD0oKQXSjUH\nJd/Jz8y4G9i3bPtjJf02H+NOSWMl1Uh6RNJJeZ1/l3RpgdPydklz8/l4d972fknHlB3vd5KObuEz\n3C7pvnwevlS27NNKzwZ5StKnmp37m5Sem3KrpIF52WJJ++TpqZLua+GcvUfSHKWBJ++WNCaXXyLp\nRkkPADfm2smvlAYs/Tjw/yQ9LunNkhYpDROEpKHl89b7OKlYT3EI8DXg8Pz6IOlO/88A/5zX+TLw\nWEQclctuiIhG0rAX7wOQdBywJCJWNtv/1cA/RMSxeZ/fKxDT2BzDu4HLctn7gMNIz884hzQeVGns\ntG8DZ+Rj/AC4NCLqSc/huFLS24FT8udozyTSIxf+CrhKUn/SUDrn5uMdCvSPiD+2sO004P3AUcCZ\nOSEcC3wEOI70bI+PSXpDXv8w4HsR8TpgI3BBgfhKfgccH2ngyZtJozyXTAHeHhFnlwoiYjFwFfCN\niDgmIv4PuC9/TkhD5Pw80nhb1gs5qVhPsSginsxJYh5wT6ThHp4k/cBC+oG/ESAi7gVGSRoK3AJ8\nIK8zPc/vojT68p8DP5X0OPB9UsJozy8iojEinqZpiPO3AD+JiIaI+BNwby4/DDgSuCsf4wvkgQQj\nYl6O+1fARyM9k6c9M/OxFwAvkBLtT4F35wT2UeC6Vra9KyLWRsQrpIEQ35Rft0XElojYnMvfnNdf\nGhEP5Okf5XWLGg/cKelJ4LPAEWXLZuUY2nMNKeGR33/YgeNbD+M+FesptpdNN5bNN9L+v9PfA4dI\nGk16ANJXmi2vAjZExDGv2rJ4TO09AlbAvIg4oZXlrwc2UNZc1o7m4ydFRGyVdBdpSPazgGOLbtvR\nY+X3epr+8OzfyrbfBr4eEbNyE98lZcu2tHPcdLCIB3Iz3EmkpyU+VWQ765lcU7He5P+AD0G6gghY\nExEbc43mNuDrpJGW15ZvFOnZMIsknZm3VfO+iA64H/iA0rPsxwJvzeXzgdGSTsjHqJV0RJ7+a2Ak\nqZbz7VL/T+5feV8rxzkz9xcdTHqM9fxcfg1wBfBIRKxvZdt35P6nAaQk+wDp3J2uNOruIFIz3v/l\n9SeW4iY1O/4uTy+mKXG9v5VjDaNpKPcZrazT3CbSY6vL3UB6jLNrKb2ck4r1JpcAx0p6gtTHUf4j\ndgvwYZo1fZX5EHCepD+Smtf29HHQt5FGqH2a9EP4e0iP3AXOAC7Px3gc+PPc0X0Z6Xnvz5FGqf1W\n3tfrSU8PbMmLpGebzwY+HhHb8nEeJfV7tPXj+zDp2ThPAD+LiLmRHhd9XV42B7gmIh7L688HLpT0\nDGk04itz+ZeBb0maCzS0cqxLSM2KjwJr2oip3C+B95U66nPZTfnYPym4D+uhPEqxWTeRdGdEvLOD\n2+xP6tg+PPc/NV9+LjA1Ij5RcH+TSMP8H9mRODpbvrrutIj4m+6Mw14796mYdZM9SCjnAJcCn24p\nofRWkr4NvIv0DB/r5VxTMTOzTuM+FTMz6zROKmZm1mmcVMzMrNM4qZiZWadxUjEzs07z/wMRYSeM\n7YnHIwAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "counts_desc = cat_titles.value_counts().values\n", "assert sorted(counts_desc, reverse=True)\n", "\n", "plt.plot(counts_desc[1:]) # 0-th is too large\n", 
"plt.xlabel(\"movie index, by popularity\")\n", "plt.ylabel(\"# times movie appears\")\n", "plt.title(\"movie appearance count among ratings\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([3204, 5575, 7918, 9259])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "# The cumulative sum is only a popularity CDF if counts are in descending order.\n", "# Check that for real: the earlier `assert sorted(...)` merely tests that the\n", "# sorted list is non-empty.\n", "assert (np.diff(counts_desc) <= 0).all()\n", "cdf = counts_desc.cumsum() / counts_desc.sum()\n", "# number of most-popular titles needed to cover 95%, 99%, 99.9% and 100% of ratings\n", "np.searchsorted(cdf, [.95, .99, .999, 1])" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "from collections import Counter\n", "from scipy.stats import truncnorm\n", "%load_ext memory_profiler\n", "# Synthetic benchmark: n samples over d classes; `selected` holds the e most frequent classes.\n", "d = 10000\n", "e = 1000\n", "n = 100000000\n", "np.random.seed(0)  # fixed seed so the synthetic data is reproducible\n", "if d < n:\n", " dindices = np.random.geometric(p=0.01, size=(n - d)) - 1\n", " dindices = np.concatenate([dindices, np.arange(d)])\n", " dcounts = np.bincount(dindices)\n", " selected = dcounts.argsort()[::-1][:e]\n", "else:\n", " dindices = np.random.choice(d, n // 2)\n", " frequent = np.random.choice(n, n - n // 2)\n", " dindices = np.concatenate([dindices, frequent])\n", " c = Counter(dindices)\n", " selected = np.asarray(sorted(c, key=c.get, reverse=True)[:e])" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "peak memory: 2401.04 MiB, increment: 1546.69 MiB\n" ] } ], "source": [ "%%memit\n", "\n", "# np.searchsorted needs ascending input, but `selected` is ordered by count, so\n", "# search through an argsort permutation and map hits back to positions in `selected`.\n", "order = np.argsort(selected)\n", "searched = np.searchsorted(selected, dindices, sorter=order)\n", "searched[searched == len(selected)] = 0  # past-the-end probes: any valid slot, rejected by the equality test below\n", "candidate = order[searched]\n", "result = np.where(selected[candidate] == dindices, candidate, e)  # e marks 'not selected'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5.07 s ± 55.6 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] } ], "source": [ "%%timeit\n", "\n", "# np.searchsorted needs ascending input, but `selected` is ordered by count, so\n", "# search through an argsort permutation and map hits back to positions in `selected`.\n", "order = np.argsort(selected)\n", "searched = np.searchsorted(selected, dindices, sorter=order)\n", "searched[searched == len(selected)] = 0  # past-the-end probes: any valid slot, rejected by the equality test below\n", "candidate = order[searched]\n", "result = np.where(selected[candidate] == dindices, candidate, e)  # e marks 'not selected'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "peak memory: 1501.95 MiB, increment: 647.29 MiB\n" ] } ], "source": [ "%%memit\n", "# Lookup table: class id -> position in `selected`, or e for classes that were not selected.\n", "mapping = np.full(d, e)\n", "mapping[selected] = np.arange(e)\n", "# Bind to a new name: rebinding dindices would replace the benchmark input with its own output.\n", "result = np.take(mapping, dindices)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "330 ms ± 6.82 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%%timeit\n", "# Lookup table: class id -> position in `selected`, or e for classes that were not selected.\n", "mapping = np.full(d, e)\n", "mapping[selected] = np.arange(e)\n", "# Bind to a new name: %%timeit runs the cell body inside a function, so assigning to\n", "# dindices here would make it a local name that is read before it is set.\n", "result = np.take(mapping, dindices)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }