Commit 02917bbf authored by wx002's avatar wx002

added reduced data

parent f7b093f6
This source diff could not be displayed because it is too large. You can view the blob instead.
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from urllib.request import urlopen\n",
"import json\n",
"import re\n",
"import pandas as pd\n",
"import numpy as np\n",
"import datetime"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"Brightkite_data = pd.read_csv('Datasets/Brightkite_light.txt')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Unnamed: 0', 'UserID', 'PlaceID', 'Coordinates', 'Date', 'Time'], dtype='object')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Brightkite_data.columns"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"Brightkite_data.drop(columns=['Unnamed: 0'], inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### We don't need places with only one visit per day"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"multi_visit_data = Brightkite_data[(Brightkite_data.duplicated(['Coordinates','Date'], keep = 'first') | Brightkite_data.duplicated(['Coordinates','Date'], keep = 'last'))]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What Are the Most Visited Places"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"place_groups = multi_visit_data.groupby(['Coordinates'], group_keys=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Coordinates</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Coordinates</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>(33.748995, -84.387982)</th>\n",
" <td>1614</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(29.763284000000002, -95.363271)</th>\n",
" <td>1629</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(38.900112, -77.01636500000001)</th>\n",
" <td>1659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(36.0, 138.0)</th>\n",
" <td>1714</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(40.714269, -74.005973)</th>\n",
" <td>2807</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(34.052234000000006, -118.24368500000001)</th>\n",
" <td>2988</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(40.014986, -105.270546)</th>\n",
" <td>3294</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(47.606209, -122.332071)</th>\n",
" <td>3856</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(37.774929, -122.41941499999999)</th>\n",
" <td>4817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(39.739154, -104.984703)</th>\n",
" <td>7441</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Coordinates\n",
"Coordinates \n",
"(33.748995, -84.387982) 1614\n",
"(29.763284000000002, -95.363271) 1629\n",
"(38.900112, -77.01636500000001) 1659\n",
"(36.0, 138.0) 1714\n",
"(40.714269, -74.005973) 2807\n",
"(34.052234000000006, -118.24368500000001) 2988\n",
"(40.014986, -105.270546) 3294\n",
"(47.606209, -122.332071) 3856\n",
"(37.774929, -122.41941499999999) 4817\n",
"(39.739154, -104.984703) 7441"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"place_counts = place_groups['Coordinates'].count()\n",
"place_rankings = place_counts.sort_values().tail(10).to_frame()\n",
"place_rankings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that we have the top 10 most visited locations, a quick glance shows no duplicates among them. Hence, we can pass them to Bing's reverse-geocoding API to see what kinds of places these are and make some inferences.\n",
"\n",
"Below is the code for the Bing API, which is nearly identical to the version we used earlier with the phone dataset."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def tuple_str_to_list(string):\n",
" string = string.replace('(', '')\n",
" string = string.replace(')', '')\n",
" return [float(s) for s in string.split(',')]"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"\\nrankings = place_rankings.to_frame()\\nlocations = rankings['Coordinates'].index\\nimport geocoder\\nbing_key = 'AiEfap-qUoZalL1qK8ollM-SwVdoJFemh60tHo0EeraVYP8V4WPJXAVD2YjqzgA1'\\ncoordinates = locations\\naddr_file = open('Datasets/address_kite.txt', 'w+', encoding='utf-8')\\nfor cord in coordinates:\\n cord_list = tuple_str_to_list(cord)\\n g = geocoder.bing(cord_list, method = 'reverse', key = bing_key)\\n for r in g:\\n if r.city == None:\\n line_str = r.address + ',' + r.country + '\\n'\\n else:\\n line_str = r.address + ',' + r.city + ',' + r.country + '\\n'\\n addr_file.write(str(cord_list) + ' : ' + line_str)\\n print('{} : {}'.format(cord_list, line_str))\\naddr_file.close()\\nprint('finish address!')\\n\""
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"rankings = place_rankings.to_frame()\n",
"locations = rankings['Coordinates'].index\n",
"import geocoder\n",
"bing_key = 'AiEfap-qUoZalL1qK8ollM-SwVdoJFemh60tHo0EeraVYP8V4WPJXAVD2YjqzgA1'\n",
"coordinates = locations\n",
"addr_file = open('Datasets/address_kite.txt', 'w+', encoding='utf-8')\n",
"for cord in coordinates:\n",
" cord_list = tuple_str_to_list(cord)\n",
" g = geocoder.bing(cord_list, method = 'reverse', key = bing_key)\n",
" for r in g:\n",
" if r.city == None:\n",
" line_str = r.address + ',' + r.country + '\\n'\n",
" else:\n",
" line_str = r.address + ',' + r.city + ',' + r.country + '\\n'\n",
" addr_file.write(str(cord_list) + ' : ' + line_str)\n",
" print('{} : {}'.format(cord_list, line_str))\n",
"addr_file.close()\n",
"print('finish address!')\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Coordinates</th>\n",
" <th>address</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Coordinates</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>(33.748995, -84.387982)</th>\n",
" <td>1614</td>\n",
" <td>Capitol Ave SE, Atlanta, GA 30303,Downtown,Un...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(29.763284000000002, -95.363271)</th>\n",
" <td>1629</td>\n",
" <td>777 Preston St, Houston, TX 77002,Downtown,Un...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(38.900112, -77.01636500000001)</th>\n",
" <td>1659</td>\n",
" <td>406 Massachusetts Ave NW, Washington, DC 2000...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(36.0, 138.0)</th>\n",
" <td>1714</td>\n",
" <td>399, Japan,Japan</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(40.714269, -74.005973)</th>\n",
" <td>2807</td>\n",
" <td>280 Broadway, New York, NY 10007,Civic Center...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(34.052234000000006, -118.24368500000001)</th>\n",
" <td>2988</td>\n",
" <td>100 W 1st St, Los Angeles, CA 90012,Los Angel...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(40.014986, -105.270546)</th>\n",
" <td>3294</td>\n",
" <td>1515 19th St, Boulder, CO 80302,Goss-Grove,Un...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(47.606209, -122.332071)</th>\n",
" <td>3856</td>\n",
" <td>909 5th Ave, Seattle, WA 98104,Seattle,United...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(37.774929, -122.41941499999999)</th>\n",
" <td>4817</td>\n",
" <td>1541 Market St, San Francisco, CA 94103,South...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>(39.739154, -104.984703)</th>\n",
" <td>7441</td>\n",
" <td>1347 N Sherman St, Denver, CO 80203,Capitol H...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Coordinates \\\n",
"Coordinates \n",
"(33.748995, -84.387982) 1614 \n",
"(29.763284000000002, -95.363271) 1629 \n",
"(38.900112, -77.01636500000001) 1659 \n",
"(36.0, 138.0) 1714 \n",
"(40.714269, -74.005973) 2807 \n",
"(34.052234000000006, -118.24368500000001) 2988 \n",
"(40.014986, -105.270546) 3294 \n",
"(47.606209, -122.332071) 3856 \n",
"(37.774929, -122.41941499999999) 4817 \n",
"(39.739154, -104.984703) 7441 \n",
"\n",
" address \n",
"Coordinates \n",
"(33.748995, -84.387982) Capitol Ave SE, Atlanta, GA 30303,Downtown,Un... \n",
"(29.763284000000002, -95.363271) 777 Preston St, Houston, TX 77002,Downtown,Un... \n",
"(38.900112, -77.01636500000001) 406 Massachusetts Ave NW, Washington, DC 2000... \n",
"(36.0, 138.0) 399, Japan,Japan \n",
"(40.714269, -74.005973) 280 Broadway, New York, NY 10007,Civic Center... \n",
"(34.052234000000006, -118.24368500000001) 100 W 1st St, Los Angeles, CA 90012,Los Angel... \n",
"(40.014986, -105.270546) 1515 19th St, Boulder, CO 80302,Goss-Grove,Un... \n",
"(47.606209, -122.332071) 909 5th Ave, Seattle, WA 98104,Seattle,United... \n",
"(37.774929, -122.41941499999999) 1541 Market St, San Francisco, CA 94103,South... \n",
"(39.739154, -104.984703) 1347 N Sherman St, Denver, CO 80203,Capitol H... "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"address_kite = pd.read_csv('Datasets/address_kite.txt', sep=':', header=None)\n",
"place_rankings['address'] = address_kite[1].values\n",
"place_rankings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Top Visited Places Analysis\n",
"##### Number 1: Colorado State Capitol"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"place_0 = place_groups.get_group('(39.739154, -104.984703)')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 636.000000\n",
"mean 11.699686\n",
"std 6.338688\n",
"min 2.000000\n",
"25% 7.000000\n",
"50% 12.000000\n",
"75% 17.000000\n",
"max 27.000000\n",
"Name: Date, dtype: float64\n"
]
}
],
"source": [
"date_groups = place_0.groupby(['Date'], group_keys=True)\n",
"date_counts = date_groups['Date'].count()\n",
"date_rankings = date_counts.sort_values()\n",
"print(date_rankings.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Colorado State Capitol is the most visited place in this dataset, with a mean of about 11.7 visits per day (counting only days with at least two visits, since single-visit days were filtered out earlier)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Number 2: Bus Stop in San Francisco"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"place_1 = place_groups.get_group('(37.774929, -122.41941499999999)')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 651.000000\n",
"mean 7.399386\n",
"std 4.109681\n",
"min 2.000000\n",
"25% 4.000000\n",
"50% 7.000000\n",
"75% 10.000000\n",
"max 21.000000\n",
"Name: Date, dtype: float64\n"
]
}
],
"source": [
"date_groups = place_1.groupby(['Date'], group_keys=True)\n",
"date_counts = date_groups['Date'].count()\n",
"date_rankings = date_counts.sort_values()\n",
"print(date_rankings.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A bus stop on Market Street in San Francisco, next to what looks like a subway entrance, has a mean of about 7.4 visits per day."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Number 3: Fourth and Madison Building in Seattle"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"place_2 = place_groups.get_group('(47.606209, -122.332071)')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 569.000000\n",
"mean 6.776801\n",
"std 2.957639\n",
"min 2.000000\n",
"25% 5.000000\n",
"50% 7.000000\n",
"75% 9.000000\n",
"max 17.000000\n",
"Name: Date, dtype: float64\n"
]
}
],
"source": [
"date_groups = place_2.groupby(['Date'], group_keys=True)\n",
"date_counts = date_groups['Date'].count()\n",
"date_rankings = date_counts.sort_values()\n",
"print(date_rankings.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Fourth and Madison Building in Seattle, Washington is a large skyscraper that houses many different companies. It follows close behind the San Francisco bus stop with a little less than 7 users visiting per day."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Number 4: Boulder, Colorado"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 532.000000\n",
"mean 6.191729\n",
"std 2.471854\n",
"min 2.000000\n",
"25% 4.000000\n",
"50% 6.000000\n",
"75% 8.000000\n",
"max 14.000000\n",
"Name: Date, dtype: float64\n"
]
}
],
"source": [
"place_3 = place_groups.get_group('(40.014986, -105.270546)')\n",
"date_groups = place_3.groupby(['Date'], group_keys=True)\n",
"date_counts = date_groups['Date'].count()\n",
"date_rankings = date_counts.sort_values()\n",
"print(date_rankings.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This location is in Boulder, Colorado, and it appears that there may be some sort of travel agency at this location. This location averages about 6 visits per day."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Number 5: Los Angeles Police Department Headquarters"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 665.000000\n",
"mean 4.493233\n",
"std 2.240851\n",
"min 2.000000\n",
"25% 3.000000\n",
"50% 4.000000\n",
"75% 6.000000\n",
"max 14.000000\n",
"Name: Date, dtype: float64\n"
]
}
],
"source": [
"place_4 = place_groups.get_group('(34.052234000000006, -118.24368500000001)')\n",
"date_groups = place_4.groupby(['Date'], group_keys=True)\n",
"date_counts = date_groups['Date'].count()\n",
"date_rankings = date_counts.sort_values()\n",
"print(date_rankings.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, this location is very intriguing, averaging about 4 users visiting per day with a max of 14 in one day! Are these users all police officers? Or are people checking in as they are brought into the station? Or are there just a lot of visitors at the police department?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Number 6: Forest in Japan"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 622.000000\n",
"mean 4.512862\n",
"std 2.145060\n",
"min 2.000000\n",
"25% 3.000000\n",
"50% 4.000000\n",
"75% 6.000000\n",
"max 13.000000\n",
"Name: Date, dtype: float64\n"
]
}
],
"source": [
"place_5 = place_groups.get_group('(40.714269, -74.005973)')\n",
"date_groups = place_5.groupby(['Date'], group_keys=True)\n",
"date_counts = date_groups['Date'].count()\n",
"date_rankings = date_counts.sort_values()\n",
"print(date_rankings.describe())"
]
},
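{
"cell_type": "markdown",
"metadata": {},
"source": [
"This location is very odd since it is the only one not in the United States. Looking at Google Maps, it appears to be in a forest in Japan. Our only guess is that there is some sort of VPN out there that people are connecting to.\n",
"\n",
"**FIXME:** the code cell above computes its statistics for `(40.714269, -74.005973)` (280 Broadway, New York), not for the Japan coordinates `(36.0, 138.0)`, so the numbers shown do not describe this location."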
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Number 7: 400 Massachusetts Ave, Washington D.C."
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 386.000000\n",
"mean 4.440415\n",
"std 2.827109\n",
"min 2.000000\n",
"25% 2.000000\n",
"50% 3.000000\n",
"75% 6.000000\n",
"max 17.000000\n",
"Name: Date, dtype: float64\n"
]
}
],
"source": [
"place_6 = place_groups.get_group('(36.0, 138.0)')\n",
"date_groups = place_6.groupby(['Date'], group_keys=True)\n",
"date_counts = date_groups['Date'].count()\n",
"date_rankings = date_counts.sort_values()\n",
"print(date_rankings.describe())"
]
},
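{
"cell_type": "markdown",
"metadata": {},
"source": [
"A very popular address known to most people living or working in D.C., 400 Massachusetts Avenue is home to a 13-story building that houses many condos and rents out street-level space to retailers. The roughly 4 users who visit per day could therefore be either residents or shoppers.\n",
"\n",
"**FIXME:** the code cell above computes its statistics for `(36.0, 138.0)` (the Japan coordinates), not for the D.C. coordinates `(38.900112, -77.01636500000001)`, so the numbers shown do not describe this location."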
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Number 8: The Dawson Thomas Law Group"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 424.000000\n",
"mean 3.912736\n",
"std 1.759655\n",
"min 2.000000\n",
"25% 2.000000\n",
"50% 4.000000\n",
"75% 5.000000\n",
"max 13.000000\n",
"Name: Date, dtype: float64\n"
]
}
],
"source": [
"place_7 = place_groups.get_group('(38.900112, -77.01636500000001)')\n",
"date_groups = place_7.groupby(['Date'], group_keys=True)\n",
"date_counts = date_groups['Date'].count()\n",
"date_rankings = date_counts.sort_values()\n",
"print(date_rankings.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Dawson Thomas Law"
]
},
{
"cell_type": "markdown",