{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91492c04",
   "metadata": {},
   "outputs": [],
   "source": [
    "# How to load a Python script from book into Jupyter Notebook:\n",
    "#\n",
    "#            %load book/scratch/introduction.py\n",
    "\n",
    "# Use  control-shift-minus to split cells and try things out !!!\n",
    "#\n",
    "\n",
    "\"\"\"\n",
    "This is code for the introduction chapter. As such, it stands alone\n",
    "and won't be used anywhere else in the book.\n",
    "\"\"\"\n",
    "# type: ignore\n",
    "\n",
    "users = [\n",
    "    { \"id\": 0, \"name\": \"Hero\" },\n",
    "    { \"id\": 1, \"name\": \"Dunn\" },\n",
    "    { \"id\": 2, \"name\": \"Sue\" },\n",
    "    { \"id\": 3, \"name\": \"Chi\" },\n",
    "    { \"id\": 4, \"name\": \"Thor\" },\n",
    "    { \"id\": 5, \"name\": \"Clive\" },\n",
    "    { \"id\": 6, \"name\": \"Hicks\" },\n",
    "    { \"id\": 7, \"name\": \"Devin\" },\n",
    "    { \"id\": 8, \"name\": \"Kate\" },\n",
    "    { \"id\": 9, \"name\": \"Klein\" }\n",
    "]\n",
    "\n",
    "# Use"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8130362e",
   "metadata": {},
   "outputs": [],
   "source": [
    "friendship_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),\n",
    "                    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]\n",
    "\n",
    "# Initialize the dict with an empty list for each user id:\n",
    "friendships = {user[\"id\"]: [] for user in users}\n",
    "\n",
    "# And loop over the friendship pairs to populate it:\n",
    "for i, j in friendship_pairs:\n",
    "    friendships[i].append(j)  # Add j as a friend of user i\n",
    "    friendships[j].append(i)  # Add i as a friend of user j\n",
    "\n",
    "def number_of_friends(user):\n",
    "    \"\"\"How many friends does _user_ have?\"\"\"\n",
    "    user_id = user[\"id\"]\n",
    "    friend_ids = friendships[user_id]\n",
    "    return len(friend_ids)\n",
    "\n",
    "total_connections = sum(number_of_friends(user)\n",
    "                        for user in users)        # 24\n",
    "\n",
    "\n",
    "assert total_connections == 24\n",
    "\n",
    "num_users = len(users)                            # length of the users list\n",
    "avg_connections = total_connections / num_users   # 24 / 10 == 2.4\n",
    "\n",
    "\n",
    "assert num_users == 10\n",
    "assert avg_connections == 2.4\n",
    "\n",
    "# Create a list (user_id, number_of_friends).\n",
    "num_friends_by_id = [(user[\"id\"], number_of_friends(user))\n",
    "                     for user in users]\n",
    "\n",
    "num_friends_by_id.sort(                                # Sort the list\n",
    "       key=lambda id_and_friends: id_and_friends[1],   # by num_friends\n",
    "       reverse=True)                                   # largest to smallest\n",
    "\n",
    "# Each pair is (user_id, num_friends):\n",
    "# [(1, 3), (2, 3), (3, 3), (5, 3), (8, 3),\n",
    "#  (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]\n",
    "\n",
    "\n",
    "assert num_friends_by_id[0][1] == 3     # several people have 3 friends\n",
    "assert num_friends_by_id[-1] == (9, 1)  # user 9 has only 1 friend\n",
    "\n",
    "def foaf_ids_bad(user):\n",
    "    \"\"\"foaf is short for \"friend of a friend\" \"\"\"\n",
    "    return [foaf_id\n",
    "            for friend_id in friendships[user[\"id\"]]\n",
    "            for foaf_id in friendships[friend_id]]\n",
    "\n",
    "[0, 2, 3, 0, 1, 3]\n",
    "\n",
    "\n",
    "assert foaf_ids_bad(users[0]) == [0, 2, 3, 0, 1, 3]\n",
    "\n",
    "print(friendships[0])  # [1, 2]\n",
    "print(friendships[1])  # [0, 2, 3]\n",
    "print(friendships[2])  # [0, 1, 3]\n",
    "\n",
    "\n",
    "assert friendships[0] == [1, 2]\n",
    "assert friendships[1] == [0, 2, 3]\n",
    "assert friendships[2] == [0, 1, 3]\n",
    "\n",
    "from collections import Counter                   # not loaded by default\n",
    "\n",
    "def friends_of_friends(user):\n",
    "    user_id = user[\"id\"]\n",
    "    return Counter(\n",
    "        foaf_id\n",
    "        for friend_id in friendships[user_id]     # For each of my friends,\n",
    "        for foaf_id in friendships[friend_id]     # find their friends\n",
    "        if foaf_id != user_id                     # who aren't me\n",
    "        and foaf_id not in friendships[user_id]   # and aren't my friends.\n",
    "    )\n",
    "\n",
    "\n",
    "print(friends_of_friends(users[3]))               # Counter({0: 2, 5: 1})\n",
    "\n",
    "\n",
    "assert friends_of_friends(users[3]) == Counter({0: 2, 5: 1})\n",
    "\n",
    "interests = [\n",
    "    (0, \"Hadoop\"), (0, \"Big Data\"), (0, \"HBase\"), (0, \"Java\"),\n",
    "    (0, \"Spark\"), (0, \"Storm\"), (0, \"Cassandra\"),\n",
    "    (1, \"NoSQL\"), (1, \"MongoDB\"), (1, \"Cassandra\"), (1, \"HBase\"),\n",
    "    (1, \"Postgres\"), (2, \"Python\"), (2, \"scikit-learn\"), (2, \"scipy\"),\n",
    "    (2, \"numpy\"), (2, \"statsmodels\"), (2, \"pandas\"), (3, \"R\"), (3, \"Python\"),\n",
    "    (3, \"statistics\"), (3, \"regression\"), (3, \"probability\"),\n",
    "    (4, \"machine learning\"), (4, \"regression\"), (4, \"decision trees\"),\n",
    "    (4, \"libsvm\"), (5, \"Python\"), (5, \"R\"), (5, \"Java\"), (5, \"C++\"),\n",
    "    (5, \"Haskell\"), (5, \"programming languages\"), (6, \"statistics\"),\n",
    "    (6, \"probability\"), (6, \"mathematics\"), (6, \"theory\"),\n",
    "    (7, \"machine learning\"), (7, \"scikit-learn\"), (7, \"Mahout\"),\n",
    "    (7, \"neural networks\"), (8, \"neural networks\"), (8, \"deep learning\"),\n",
    "    (8, \"Big Data\"), (8, \"artificial intelligence\"), (9, \"Hadoop\"),\n",
    "    (9, \"Java\"), (9, \"MapReduce\"), (9, \"Big Data\")\n",
    "]\n",
    "\n",
    "def data_scientists_who_like(target_interest):\n",
    "    \"\"\"Find the ids of all users who like the target interest.\"\"\"\n",
    "    return [user_id\n",
    "            for user_id, user_interest in interests\n",
    "            if user_interest == target_interest]\n",
    "\n",
    "from collections import defaultdict\n",
    "\n",
    "# Keys are interests, values are lists of user_ids with that interest\n",
    "user_ids_by_interest = defaultdict(list)\n",
    "\n",
    "for user_id, interest in interests:\n",
    "    user_ids_by_interest[interest].append(user_id)\n",
    "\n",
    "# Keys are user_ids, values are lists of interests for that user_id.\n",
    "interests_by_user_id = defaultdict(list)\n",
    "\n",
    "for user_id, interest in interests:\n",
    "    interests_by_user_id[user_id].append(interest)\n",
    "\n",
    "def most_common_interests_with(user):\n",
    "    return Counter(\n",
    "        interested_user_id\n",
    "        for interest in interests_by_user_id[user[\"id\"]]\n",
    "        for interested_user_id in user_ids_by_interest[interest]\n",
    "        if interested_user_id != user[\"id\"]\n",
    "    )\n",
    "\n",
    "salaries_and_tenures = [(83000, 8.7), (88000, 8.1),\n",
    "                        (48000, 0.7), (76000, 6),\n",
    "                        (69000, 6.5), (76000, 7.5),\n",
    "                        (60000, 2.5), (83000, 10),\n",
    "                        (48000, 1.9), (63000, 4.2)]\n",
    "\n",
    "# Keys are years, values are lists of the salaries for each tenure.\n",
    "salary_by_tenure = defaultdict(list)\n",
    "\n",
    "for salary, tenure in salaries_and_tenures:\n",
    "    salary_by_tenure[tenure].append(salary)\n",
    "\n",
    "# Keys are years, each value is average salary for that tenure.\n",
    "average_salary_by_tenure = {\n",
    "    tenure: sum(salaries) / len(salaries)\n",
    "    for tenure, salaries in salary_by_tenure.items()\n",
    "}\n",
    "\n",
    "\n",
    "assert average_salary_by_tenure == {\n",
    "    0.7: 48000.0,\n",
    "    1.9: 48000.0,\n",
    "    2.5: 60000.0,\n",
    "    4.2: 63000.0,\n",
    "    6: 76000.0,\n",
    "    6.5: 69000.0,\n",
    "    7.5: 76000.0,\n",
    "    8.1: 88000.0,\n",
    "    8.7: 83000.0,\n",
    "    10: 83000.0\n",
    "}\n",
    "\n",
    "{0.7: 48000.0,\n",
    " 1.9: 48000.0,\n",
    " 2.5: 60000.0,\n",
    " 4.2: 63000.0,\n",
    " 6: 76000.0,\n",
    " 6.5: 69000.0,\n",
    " 7.5: 76000.0,\n",
    " 8.1: 88000.0,\n",
    " 8.7: 83000.0,\n",
    " 10: 83000.0}\n",
    "\n",
    "def tenure_bucket(tenure):\n",
    "    if tenure < 2:\n",
    "        return \"less than two\"\n",
    "    elif tenure < 5:\n",
    "        return \"between two and five\"\n",
    "    else:\n",
    "        return \"more than five\"\n",
    "\n",
    "# Keys are tenure buckets, values are lists of salaries for that bucket.\n",
    "salary_by_tenure_bucket = defaultdict(list)\n",
    "\n",
    "for salary, tenure in salaries_and_tenures:\n",
    "    bucket = tenure_bucket(tenure)\n",
    "    salary_by_tenure_bucket[bucket].append(salary)\n",
    "\n",
    "# Keys are tenure buckets, values are average salary for that bucket\n",
    "average_salary_by_bucket = {\n",
    "  tenure_bucket: sum(salaries) / len(salaries)\n",
    "  for tenure_bucket, salaries in salary_by_tenure_bucket.items()\n",
    "}\n",
    "\n",
    "{'between two and five': 61500.0,\n",
    " 'less than two': 48000.0,\n",
    " 'more than five': 79166.66666666667}\n",
    "\n",
    "\n",
    "assert average_salary_by_bucket == {\n",
    "    'between two and five': 61500.0,\n",
    "    'less than two': 48000.0,\n",
    "    'more than five': 79166.66666666667\n",
    "}\n",
    "\n",
    "def predict_paid_or_unpaid(years_experience):\n",
    "  if years_experience < 3.0:\n",
    "    return \"paid\"\n",
    "  elif years_experience < 8.5:\n",
    "    return \"unpaid\"\n",
    "  else:\n",
    "    return \"paid\"\n",
    "\n",
    "interests = [\n",
    "    (0, \"Hadoop\"), (0, \"Big Data\"), (0, \"HBase\"), (0, \"Java\"),\n",
    "    (0, \"Spark\"), (0, \"Storm\"), (0, \"Cassandra\"),\n",
    "    (1, \"NoSQL\"), (1, \"MongoDB\"), (1, \"Cassandra\"), (1, \"HBase\"),\n",
    "    (1, \"Postgres\"), (2, \"Python\"), (2, \"scikit-learn\"), (2, \"scipy\"),\n",
    "    (2, \"numpy\"), (2, \"statsmodels\"), (2, \"pandas\"), (3, \"R\"), (3, \"Python\"),\n",
    "    (3, \"statistics\"), (3, \"regression\"), (3, \"probability\"),\n",
    "    (4, \"machine learning\"), (4, \"regression\"), (4, \"decision trees\"),\n",
    "    (4, \"libsvm\"), (5, \"Python\"), (5, \"R\"), (5, \"Java\"), (5, \"C++\"),\n",
    "    (5, \"Haskell\"), (5, \"programming languages\"), (6, \"statistics\"),\n",
    "    (6, \"probability\"), (6, \"mathematics\"), (6, \"theory\"),\n",
    "    (7, \"machine learning\"), (7, \"scikit-learn\"), (7, \"Mahout\"),\n",
    "    (7, \"neural networks\"), (8, \"neural networks\"), (8, \"deep learning\"),\n",
    "    (8, \"Big Data\"), (8, \"artificial intelligence\"), (9, \"Hadoop\"),\n",
    "    (9, \"Java\"), (9, \"MapReduce\"), (9, \"Big Data\")\n",
    "]\n",
    "\n",
    "words_and_counts = Counter(word\n",
    "                           for user, interest in interests\n",
    "                           for word in interest.lower().split())\n",
    "\n",
    "for word, count in words_and_counts.most_common():\n",
    "    if count > 1:\n",
    "        print(word, count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b5e5f0bf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/json": {
       "cell": {
        "!": "OSMagics",
        "HTML": "Other",
        "SVG": "Other",
        "bash": "Other",
        "capture": "ExecutionMagics",
        "debug": "ExecutionMagics",
        "file": "Other",
        "html": "DisplayMagics",
        "javascript": "DisplayMagics",
        "js": "DisplayMagics",
        "latex": "DisplayMagics",
        "markdown": "DisplayMagics",
        "perl": "Other",
        "prun": "ExecutionMagics",
        "pypy": "Other",
        "python": "Other",
        "python2": "Other",
        "python3": "Other",
        "ruby": "Other",
        "script": "ScriptMagics",
        "sh": "Other",
        "svg": "DisplayMagics",
        "sx": "OSMagics",
        "system": "OSMagics",
        "time": "ExecutionMagics",
        "timeit": "ExecutionMagics",
        "writefile": "OSMagics"
       },
       "line": {
        "alias": "OSMagics",
        "alias_magic": "BasicMagics",
        "autoawait": "AsyncMagics",
        "autocall": "AutoMagics",
        "automagic": "AutoMagics",
        "autosave": "KernelMagics",
        "bookmark": "OSMagics",
        "cat": "Other",
        "cd": "OSMagics",
        "clear": "KernelMagics",
        "colors": "BasicMagics",
        "conda": "PackagingMagics",
        "config": "ConfigMagics",
        "connect_info": "KernelMagics",
        "cp": "Other",
        "debug": "ExecutionMagics",
        "dhist": "OSMagics",
        "dirs": "OSMagics",
        "doctest_mode": "BasicMagics",
        "ed": "Other",
        "edit": "KernelMagics",
        "env": "OSMagics",
        "gui": "BasicMagics",
        "hist": "Other",
        "history": "HistoryMagics",
        "killbgscripts": "ScriptMagics",
        "ldir": "Other",
        "less": "KernelMagics",
        "lf": "Other",
        "lk": "Other",
        "ll": "Other",
        "load": "CodeMagics",
        "load_ext": "ExtensionMagics",
        "loadpy": "CodeMagics",
        "logoff": "LoggingMagics",
        "logon": "LoggingMagics",
        "logstart": "LoggingMagics",
        "logstate": "LoggingMagics",
        "logstop": "LoggingMagics",
        "ls": "Other",
        "lsmagic": "BasicMagics",
        "lx": "Other",
        "macro": "ExecutionMagics",
        "magic": "BasicMagics",
        "man": "KernelMagics",
        "matplotlib": "PylabMagics",
        "mkdir": "Other",
        "more": "KernelMagics",
        "mv": "Other",
        "notebook": "BasicMagics",
        "page": "BasicMagics",
        "pastebin": "CodeMagics",
        "pdb": "ExecutionMagics",
        "pdef": "NamespaceMagics",
        "pdoc": "NamespaceMagics",
        "pfile": "NamespaceMagics",
        "pinfo": "NamespaceMagics",
        "pinfo2": "NamespaceMagics",
        "pip": "PackagingMagics",
        "popd": "OSMagics",
        "pprint": "BasicMagics",
        "precision": "BasicMagics",
        "prun": "ExecutionMagics",
        "psearch": "NamespaceMagics",
        "psource": "NamespaceMagics",
        "pushd": "OSMagics",
        "pwd": "OSMagics",
        "pycat": "OSMagics",
        "pylab": "PylabMagics",
        "qtconsole": "KernelMagics",
        "quickref": "BasicMagics",
        "recall": "HistoryMagics",
        "rehashx": "OSMagics",
        "reload_ext": "ExtensionMagics",
        "rep": "Other",
        "rerun": "HistoryMagics",
        "reset": "NamespaceMagics",
        "reset_selective": "NamespaceMagics",
        "rm": "Other",
        "rmdir": "Other",
        "run": "ExecutionMagics",
        "save": "CodeMagics",
        "sc": "OSMagics",
        "set_env": "OSMagics",
        "store": "StoreMagics",
        "sx": "OSMagics",
        "system": "OSMagics",
        "tb": "ExecutionMagics",
        "time": "ExecutionMagics",
        "timeit": "ExecutionMagics",
        "unalias": "OSMagics",
        "unload_ext": "ExtensionMagics",
        "who": "NamespaceMagics",
        "who_ls": "NamespaceMagics",
        "whos": "NamespaceMagics",
        "xdel": "NamespaceMagics",
        "xmode": "BasicMagics"
       }
      },
      "text/plain": [
       "Available line magics:\n",
       "%alias  %alias_magic  %autoawait  %autocall  %automagic  %autosave  %bookmark  %cat  %cd  %clear  %colors  %conda  %config  %connect_info  %cp  %debug  %dhist  %dirs  %doctest_mode  %ed  %edit  %env  %gui  %hist  %history  %killbgscripts  %ldir  %less  %lf  %lk  %ll  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %ls  %lsmagic  %lx  %macro  %magic  %man  %matplotlib  %mkdir  %more  %mv  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %pip  %popd  %pprint  %precision  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %rep  %rerun  %reset  %reset_selective  %rm  %rmdir  %run  %save  %sc  %set_env  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %who  %who_ls  %whos  %xdel  %xmode\n",
       "\n",
       "Available cell magics:\n",
       "%%!  %%HTML  %%SVG  %%bash  %%capture  %%debug  %%file  %%html  %%javascript  %%js  %%latex  %%markdown  %%perl  %%prun  %%pypy  %%python  %%python2  %%python3  %%ruby  %%script  %%sh  %%svg  %%sx  %%system  %%time  %%timeit  %%writefile\n",
       "\n",
       "Automagic is ON, % prefix IS NOT needed for line magics."
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%lsmagic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ef9f9508",
   "metadata": {},
   "outputs": [],
   "source": [
    "magic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42f2f6d5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}