From 02917bbfb5f27ff0480bcd93468aab1ca942cabf Mon Sep 17 00:00:00 2001 From: Ben Xu Date: Tue, 7 May 2019 09:25:39 -0400 Subject: [PATCH] added reduced data --- .../Location_Analysis-checkpoint.ipynb | 470 +++++++++- .../User Comparison-checkpoint.ipynb | 850 ++++++++++++++++++ Datasets/address_kite.txt | 10 + Location_Analysis.ipynb | 470 +++++++++- User Comparison.ipynb | 850 ++++++++++++++++++ 5 files changed, 2588 insertions(+), 62 deletions(-) create mode 100644 .ipynb_checkpoints/User Comparison-checkpoint.ipynb create mode 100644 Datasets/address_kite.txt create mode 100644 User Comparison.ipynb diff --git a/.ipynb_checkpoints/Location_Analysis-checkpoint.ipynb b/.ipynb_checkpoints/Location_Analysis-checkpoint.ipynb index 3473310..dbd6f6d 100644 --- a/.ipynb_checkpoints/Location_Analysis-checkpoint.ipynb +++ b/.ipynb_checkpoints/Location_Analysis-checkpoint.ipynb @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -68,8 +68,7 @@ " phone_data_new.close()\n", " print('finsh making file!')\n", "\n", - "#convert_json_history('Datasets/phone_history.json')\n", - "# Already ran, created phone_data.txt in Datasets " + "#convert_json_history('Datasets/phone_history.json') # Already ran, created phone_data.txt in Datasets" ] }, { @@ -84,7 +83,24 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import datetime\n", + "import warnings\n", + "from urllib.request import urlopen\n", + "import json\n", + "import re\n", + "\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": { "scrolled": true }, @@ -165,16 +181,12 @@ "4 2014-11-03 21:47:49.271 40.421286 -3.628635 33" ] }, - "execution_count": 2, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import 
pandas as pd\n", - "import numpy as np\n", - "import datetime\n", - "\n", "phone_df = pd.read_csv('Datasets/phone_data.txt', parse_dates=['Dates'], sep='\\t')\n", "phone_df.head()" ] @@ -188,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -261,7 +273,7 @@ "4 2014-11-03 21:47:49.271 33 (40.421286200000004, -3.6286354)" ] }, - "execution_count": 3, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -281,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -333,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -441,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -507,7 +519,7 @@ "2014-10-04 1413" ] }, - "execution_count": 7, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -521,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -597,7 +609,7 @@ "max 1429.000000" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -616,7 +628,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -682,7 +694,7 @@ "(39.8723041, -5.5697503) 2" ] }, - "execution_count": 17, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -703,7 +715,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -769,7 +781,7 @@ "(40.4207763, -3.6332088) 655" ] }, - "execution_count": 18, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -789,7 +801,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -863,7 
+875,7 @@ "4 (40.4207763, -3.6332088) 655" ] }, - "execution_count": 19, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -931,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 17, "metadata": { "scrolled": true }, @@ -1031,7 +1043,7 @@ " Avenida de Fuencarral, 18, 28108 Alcobendas (M... 567" ] }, - "execution_count": 20, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1053,7 +1065,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -1065,7 +1077,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "metadata": { "scrolled": true }, @@ -1083,7 +1095,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 20, "metadata": { "scrolled": false }, @@ -1091,13 +1103,13 @@ { "data": { "text/html": [ - "
" + "
" ], "text/plain": [ - "" + "" ] }, - "execution_count": 26, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1137,6 +1149,402 @@ "Next, we will be looking at social network location datasets and try to see if we can find similar info for group of users rather than a specific individual." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Brightkite - Reading Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start by reading the csv files of our datasets into a pandas DataFrame using pd.read_csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data = pd.read_csv('Datasets/Brightkite.txt', delim_whitespace = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see below that the Brightkite file contains 5 columns: UserID, Time, Latitude, Longitude, and PlaceID \n", + "\n", + "UserID: An ID number corresponding to an individual User. 
These are currently repeated (not linkably) across datasets, so we will have to find a way to ensure all IDs are unique in the future \n", + "\n", + "Time: The time(s) a user visited a location \n", + "\n", + "Latitude and Longitude: coordinates of actual location \n", + "\n", + "PlaceID: a place that corresponds to that location " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'Brightkite_data' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mBrightkite_data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mNameError\u001b[0m: name 'Brightkite_data' is not defined" + ] + } + ], + "source": [ + "Brightkite_data.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Combine Latitude and Longitude into one column: Coordinates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data['Coordinates'] = tuple(Brightkite_data[['Latitude','Longitude']].values)\n", + "Brightkite_data.drop(['Latitude','Longitude'], axis=1, inplace = True)\n", + "Brightkite_data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'Brightkite_data' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in 
\u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mBrightkite_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'UserID'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mNameError\u001b[0m: name 'Brightkite_data' is not defined" + ] + } + ], + "source": [ + "Brightkite_data['UserID'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Preprocessing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#this code is used to find places with unusually high rates of visitors\n", + "place_groups = Brightkite_data.groupby(['Coordinates'], group_keys=True)\n", + "location_counts = place_groups['Coordinates'].count()\n", + "location_rankings = location_counts.sort_values().tail(40)\n", + "location_rankings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#remove NAs. There are over 256000 datapoints at (0.0,0.0). 
This is the middle of nowhere.\n", + "#It is safe to assume these people didn't actually go there.\n", + "Brightkite_data = Brightkite_data[Brightkite_data.Coordinates != (0.0,0.0)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_by_user = Brightkite_data.groupby(['UserID'], group_keys = True)#group by person\n", + "places_per_user = group_by_user['Coordinates'].unique() #find number of unique locations each person visited" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "places_per_user[0].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "countin = 0\n", + "countout = 0\n", + "keepers = []\n", + "minimum_places = 15\n", + "for i in places_per_user:\n", + " if i.size < minimum_places:\n", + " countout += 1\n", + " keepers.append(False)\n", + " else:\n", + " countin += 1\n", + " keepers.append(True)\n", + "print(countin, countout)\n", + "print(places_per_user.index)\n", + "print(type(places_per_user))\n", + "#the only problem with keepers is that its index does not match the index of places per user.\n", + "#if we can get index in keepers to match userID like it does in places per user it might help\n", + "\n", + "#now we have the list of all users who have more than minimum_places unique coordinates logged" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We were able to successfully separate users who provided sufficient data from those who didn't." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(countin, countout)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Below you can see the location data of the users who were able to provide sufficient data, along with their user ID's." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(places_per_user[keepers].head(10))\n", + "print(places_per_user[keepers].tail(10))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " At this point we can see all of the users that we want and the ones that we don't want. Now it is time to sort them out." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finalize the Processed Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "invalid_users = []\n", + "for i in places_per_user.index:\n", + " if places_per_user[i].size < minimum_places:\n", + " invalid_users.append(i)\n", + "Brightkite_data = Brightkite_data[~Brightkite_data['UserID'].isin(invalid_users)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convert Datetime into Date and Time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The datasets we used presented its \"Time\" information in an unusable string format. In order to make use of this data, we had to convert it into something easier to utilize." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "time = Brightkite_data['Time'].str.split(\"T\", n = 1, expand = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data.drop(columns=[\"Time\"], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data['Date'] = time[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data['Time'] = time[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'Brightkite_data' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mBrightkite_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Time'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBrightkite_data\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Time'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mslice\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mNameError\u001b[0m: name 'Brightkite_data' is not defined" + ] + } + ], + "source": [ + "Brightkite_data['Time'] = Brightkite_data['Time'].str.slice(0,-4,1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"Brightkite_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " In order to tell which users have gone to different locations together, we split the time strings into Date and Time. We then removed the old version of the data, as it was no longer necessary. After that we put in the new \"Date\" and \"Time\" columns. We also converted time to simply contain minutes and seconds. This made the data much easier to work with." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Remove duplicates and save \n", + "\n", + "Sometimes users visit the same place multiple times a day; they may also check in to the same place several times during one visit. We decided to eliminate these repeat visits and just count whether or not a person visited a given place at all during the day." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data = Brightkite_data[~Brightkite_data.duplicated(['UserID','Coordinates','Date'],keep='first')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data.to_csv('Datasets/Brightkite_light.txt')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenges\n", + "Throughout this project, we have encountered numerous challenges that we have managed to overcome.\n", + "1. Large dataset, which results in long run times or cannot be processed on laptops at all.\n", + "Solution: Pre-process the dataset by removing useless information, such as places that have only been visited once or twice, to reduce the number of entries.\n", + "2. 
Limitation of Location API - Bing\n", + "Solution: Similar to the previous one, pre-process the dataset to make it smaller, so we don't risk going over the free limits or incurring long run times.\n", + "\n", + "\n", + "# Future Research\n", + "There are various ways to take this concept to the next level and do much more. We will talk specifically about the phone dataset and the Brightkite dataset.\n", + "\n", + "## Phone Dataset\n", + "1. Use the accuracy column to get more precise locations. Also, this can be applied to Google's Places API to get points of interest within that radius (in meters).\n", + "2. Resample the time intervals by a smaller time frame, such as looking at location changes every 6 hours, to get a more precise routing map\n", + "\n", + "## Brightkite Dataset\n", + "1. Route tracing for multiple users at once to get a better sense of where people go\n", + "2. Find details of the possible home address of each user\n", + "\n", + "Additionally, there is also the possibility of finding out more about the users by referencing other datasets.\n", + "\n", + "\n", + "# Conclusion\n", + "Overall, we found that location datasets can be very dangerous for anyone to collect. With sufficient location information, it is rather easy to identify users' private information solely using location data. This is clearly shown by our analysis of both of our datasets. We were capable of finding popular user locations and the general area of operations for each user. Also, by knowing exactly where users normally go, we can draw inferences about them. Hence, users should be very careful when sharing location data because it can provide critical information to external parties that can be used against them. Especially with social media, given that people normally provide their date of birth, knowing the precise home address of users makes it rather easy to uniquely identify individuals. 
" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/.ipynb_checkpoints/User Comparison-checkpoint.ipynb b/.ipynb_checkpoints/User Comparison-checkpoint.ipynb new file mode 100644 index 0000000..93b13b6 --- /dev/null +++ b/.ipynb_checkpoints/User Comparison-checkpoint.ipynb @@ -0,0 +1,850 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlopen\n", + "import json\n", + "import re\n", + "import pandas as pd\n", + "import numpy as np\n", + "import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data = pd.read_csv('Datasets/Brightkite_light.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Unnamed: 0', 'UserID', 'PlaceID', 'Coordinates', 'Date', 'Time'], dtype='object')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Brightkite_data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "Brightkite_data.drop(columns=['Unnamed: 0'], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### We don't need places with only one visit per day" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "multi_visit_data = Brightkite_data[(Brightkite_data.duplicated(['Coordinates','Date'], keep = 'first') | Brightkite_data.duplicated(['Coordinates','Date'], keep = 'last'))]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What Are the Most Visited Places" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "place_groups = multi_visit_data.groupby(['Coordinates'], group_keys=True)" + ] + 
}, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Coordinates
Coordinates
(33.748995, -84.387982)1614
(29.763284000000002, -95.363271)1629
(38.900112, -77.01636500000001)1659
(36.0, 138.0)1714
(40.714269, -74.005973)2807
(34.052234000000006, -118.24368500000001)2988
(40.014986, -105.270546)3294
(47.606209, -122.332071)3856
(37.774929, -122.41941499999999)4817
(39.739154, -104.984703)7441
\n", + "
" + ], + "text/plain": [ + " Coordinates\n", + "Coordinates \n", + "(33.748995, -84.387982) 1614\n", + "(29.763284000000002, -95.363271) 1629\n", + "(38.900112, -77.01636500000001) 1659\n", + "(36.0, 138.0) 1714\n", + "(40.714269, -74.005973) 2807\n", + "(34.052234000000006, -118.24368500000001) 2988\n", + "(40.014986, -105.270546) 3294\n", + "(47.606209, -122.332071) 3856\n", + "(37.774929, -122.41941499999999) 4817\n", + "(39.739154, -104.984703) 7441" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "place_counts = place_groups['Coordinates'].count()\n", + "place_rankings = place_counts.sort_values().tail(10).to_frame()\n", + "place_rankings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now have the top 10 most visited locations and, at a glance, there do not seem to be any duplicates. Hence, we can just load them into Bing's API to see what kinds of places these are and make some inferences.\n", + "\n", + "Below is the code for the Bing API, which is pretty much identical to our previous version of it for the phone dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def tuple_str_to_list(string):\n", + " string = string.replace('(', '')\n", + " string = string.replace(')', '')\n", + " return [float(s) for s in string.split(',')]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"\\nimport os\\nimport geocoder\\nlocations = place_rankings.index\\nbing_key = os.environ['BING_API_KEY']  # read from the environment; never commit API keys\\ncoordinates = locations\\naddr_file = open('Datasets/address_kite.txt', 'w+', encoding='utf-8')\\nfor cord in coordinates:\\n cord_list = tuple_str_to_list(cord)\\n g = geocoder.bing(cord_list, method = 'reverse', key = bing_key)\\n for r in g:\\n if r.city == None:\\n line_str = r.address + ',' + r.country + '\\n'\\n else:\\n line_str = r.address + ',' + r.city + ',' + r.country + '\\n'\\n addr_file.write(str(cord_list) + ' : ' + line_str)\\n print('{} : {}'.format(cord_list, line_str))\\naddr_file.close()\\nprint('finish address!')\\n\"" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'''\n", + "import os\n", + "import geocoder\n", + "locations = place_rankings.index\n", + "bing_key = os.environ['BING_API_KEY']  # read from the environment; never commit API keys\n", + "coordinates = locations\n", + "addr_file = open('Datasets/address_kite.txt', 'w+', encoding='utf-8')\n", + "for cord in coordinates:\n", + " cord_list = tuple_str_to_list(cord)\n", + " g = geocoder.bing(cord_list, method = 'reverse', key = bing_key)\n", + " for r in g:\n", + " if r.city == None:\n", + " line_str = r.address + ',' + r.country + '\\n'\n", + " else:\n", + " line_str = r.address + ',' + r.city + ',' + r.country + '\\n'\n", + " addr_file.write(str(cord_list) + ' : ' + line_str)\n", + " print('{} : 
{}'.format(cord_list, line_str))\n", + "addr_file.close()\n", + "print('finish address!')\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Coordinatesaddress
Coordinates
(33.748995, -84.387982)1614Capitol Ave SE, Atlanta, GA 30303,Downtown,Un...
(29.763284000000002, -95.363271)1629777 Preston St, Houston, TX 77002,Downtown,Un...
(38.900112, -77.01636500000001)1659406 Massachusetts Ave NW, Washington, DC 2000...
(36.0, 138.0)1714399, Japan,Japan
(40.714269, -74.005973)2807280 Broadway, New York, NY 10007,Civic Center...
(34.052234000000006, -118.24368500000001)2988100 W 1st St, Los Angeles, CA 90012,Los Angel...
(40.014986, -105.270546)32941515 19th St, Boulder, CO 80302,Goss-Grove,Un...
(47.606209, -122.332071)3856909 5th Ave, Seattle, WA 98104,Seattle,United...
(37.774929, -122.41941499999999)48171541 Market St, San Francisco, CA 94103,South...
(39.739154, -104.984703)74411347 N Sherman St, Denver, CO 80203,Capitol H...
\n", + "
" + ], + "text/plain": [ + " Coordinates \\\n", + "Coordinates \n", + "(33.748995, -84.387982) 1614 \n", + "(29.763284000000002, -95.363271) 1629 \n", + "(38.900112, -77.01636500000001) 1659 \n", + "(36.0, 138.0) 1714 \n", + "(40.714269, -74.005973) 2807 \n", + "(34.052234000000006, -118.24368500000001) 2988 \n", + "(40.014986, -105.270546) 3294 \n", + "(47.606209, -122.332071) 3856 \n", + "(37.774929, -122.41941499999999) 4817 \n", + "(39.739154, -104.984703) 7441 \n", + "\n", + " address \n", + "Coordinates \n", + "(33.748995, -84.387982) Capitol Ave SE, Atlanta, GA 30303,Downtown,Un... \n", + "(29.763284000000002, -95.363271) 777 Preston St, Houston, TX 77002,Downtown,Un... \n", + "(38.900112, -77.01636500000001) 406 Massachusetts Ave NW, Washington, DC 2000... \n", + "(36.0, 138.0) 399, Japan,Japan \n", + "(40.714269, -74.005973) 280 Broadway, New York, NY 10007,Civic Center... \n", + "(34.052234000000006, -118.24368500000001) 100 W 1st St, Los Angeles, CA 90012,Los Angel... \n", + "(40.014986, -105.270546) 1515 19th St, Boulder, CO 80302,Goss-Grove,Un... \n", + "(47.606209, -122.332071) 909 5th Ave, Seattle, WA 98104,Seattle,United... \n", + "(37.774929, -122.41941499999999) 1541 Market St, San Francisco, CA 94103,South... \n", + "(39.739154, -104.984703) 1347 N Sherman St, Denver, CO 80203,Capitol H... 
" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "address_kite = pd.read_csv('Datasets/address_kite.txt', sep=':', header=None)\n", + "place_rankings['address'] = address_kite[1].values\n", + "place_rankings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Top Visited Places Analysis\n", + "##### Number 1: Colorado State Capitol" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "place_0 = place_groups.get_group('(39.739154, -104.984703)')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 636.000000\n", + "mean 11.699686\n", + "std 6.338688\n", + "min 2.000000\n", + "25% 7.000000\n", + "50% 12.000000\n", + "75% 17.000000\n", + "max 27.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "date_groups = place_0.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values()\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Colorado State Capitol is the most visited place in this dataset, with a mean of about 11.7 users visiting per day." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Number 2: Bus Stop in San Francisco" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "place_1 = place_groups.get_group('(37.774929, -122.41941499999999)')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 651.000000\n", + "mean 7.399386\n", + "std 4.109681\n", + "min 2.000000\n", + "25% 4.000000\n", + "50% 7.000000\n", + "75% 10.000000\n", + "max 21.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "date_groups = place_1.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values()\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A bus stop on Market Street, also with what looks like an entrance to the subway, in San Francisco has a mean of 7 users visiting per day." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Number 3: Fourth and Madison Building in Seattle" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "place_2 = place_groups.get_group('(47.606209, -122.332071)')" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 569.000000\n", + "mean 6.776801\n", + "std 2.957639\n", + "min 2.000000\n", + "25% 5.000000\n", + "50% 7.000000\n", + "75% 9.000000\n", + "max 17.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "date_groups = place_2.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values()\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Fourth and Madison Building in Seattle, Washington is a large skyscraper that houses many different companies. It follows close behind the San Francisco bus stop with a little less than 7 users visiting per day." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Number 4: Boulder, Colorado" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 532.000000\n", + "mean 6.191729\n", + "std 2.471854\n", + "min 2.000000\n", + "25% 4.000000\n", + "50% 6.000000\n", + "75% 8.000000\n", + "max 14.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "place_3 = place_groups.get_group('(40.014986, -105.270546)')\n", + "date_groups = place_3.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values()\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This location is in Boulder, Colorado, and it appears that there may be some sort of travel agency at this location. This location averages about 6 visits per day." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Number 5: Los Angeles Police Department Headquarters" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 665.000000\n", + "mean 4.493233\n", + "std 2.240851\n", + "min 2.000000\n", + "25% 3.000000\n", + "50% 4.000000\n", + "75% 6.000000\n", + "max 14.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "place_4 = place_groups.get_group('(34.052234000000006, -118.24368500000001)')\n", + "date_groups = place_4.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values()\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, this location is very intriguing, averaging about 4 users visiting per day with a max of 14 in one day! 
Are these users all police officers? Or are people checking in as they are brought into the station? Or are there just a lot of visitors at the police department?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Number 6: Forest in Japan" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 622.000000\n", + "mean 4.512862\n", + "std 2.145060\n", + "min 2.000000\n", + "25% 3.000000\n", + "50% 4.000000\n", + "75% 6.000000\n", + "max 13.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "place_5 = place_groups.get_group('(40.714269, -74.005973)')\n", + "date_groups = place_5.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values()\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This location is very odd, since it is the only one not in the United States. Looking at Google Maps, it appears to be in a forest in Japan. Our only explanation is that there is some sort of VPN there that people are connecting to." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Number 7: 400 Massachusetts Ave, Washington D.C."
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 386.000000\n", + "mean 4.440415\n", + "std 2.827109\n", + "min 2.000000\n", + "25% 2.000000\n", + "50% 3.000000\n", + "75% 6.000000\n", + "max 17.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "place_6 = place_groups.get_group('(36.0, 138.0)')\n", + "date_groups = place_6.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values()\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A very popular address known to most people living or working in D.C., 400 Massachusetts Avenue is home to a 13-story building that houses many condos and rents out street-level space to retailers. The roughly 4 users who visit per day on average could therefore be either residents or shoppers."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Number 8: The Dawson Thomas Law Group" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 424.000000\n", + "mean 3.912736\n", + "std 1.759655\n", + "min 2.000000\n", + "25% 2.000000\n", + "50% 4.000000\n", + "75% 5.000000\n", + "max 13.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "place_7 = place_groups.get_group('(38.900112, -77.01636500000001)')\n", + "date_groups = place_7.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values()\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Dawson Thomas Law Group." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Number 9" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 10.000000\n", + "mean 6.700000\n", + "std 0.483046\n", + "min 6.000000\n", + "25% 6.250000\n", + "50% 7.000000\n", + "75% 7.000000\n", + "max 7.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "place_8 = place_groups.get_group('(29.763284000000002, -95.363271)')\n", + "date_groups = place_8.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values().tail(10)\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Number 10" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count 10.000000\n", + "mean 7.300000\n", + "std 0.483046\n", + "min 7.000000\n", + "25% 7.000000\n", + "50%
7.000000\n", + "75% 7.750000\n", + "max 8.000000\n", + "Name: Date, dtype: float64\n" + ] + } + ], + "source": [ + "place_9 = place_groups.get_group('(33.748995, -84.387982)')\n", + "date_groups = place_9.groupby(['Date'], group_keys=True)\n", + "date_counts = date_groups['Date'].count()\n", + "date_rankings = date_counts.sort_values().tail(10)\n", + "print(date_rankings.describe())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Datasets/address_kite.txt b/Datasets/address_kite.txt new file mode 100644 index 0000000..3ed84c0 --- /dev/null +++ b/Datasets/address_kite.txt @@ -0,0 +1,10 @@ +[33.748995, -84.387982] : Capitol Ave SE, Atlanta, GA 30303,Downtown,United States +[29.763284000000002, -95.363271] : 777 Preston St, Houston, TX 77002,Downtown,United States +[38.900112, -77.01636500000001] : 406 Massachusetts Ave NW, Washington, DC 20001,Downtown,United States +[36.0, 138.0] : 399, Japan,Japan +[40.714269, -74.005973] : 280 Broadway, New York, NY 10007,Civic Center,United States +[34.052234000000006, -118.24368500000001] : 100 W 1st St, Los Angeles, CA 90012,Los Angeles,United States +[40.014986, -105.270546] : 1515 19th St, Boulder, CO 80302,Goss-Grove,United States +[47.606209, -122.332071] : 909 5th Ave, Seattle, WA 98104,Seattle,United States +[37.774929, -122.41941499999999] : 1541 Market St, San Francisco, CA 94103,South of Market,United States +[39.739154, -104.984703] : 1347 N Sherman St, Denver, CO 80203,Capitol Hill,United States diff --git 
a/Location_Analysis.ipynb b/Location_Analysis.ipynb index 3473310..dbd6f6d 100644 --- a/Location_Analysis.ipynb +++ b/Location_Analysis.ipynb @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -68,8 +68,7 @@ " phone_data_new.close()\n", " print('finsh making file!')\n", "\n", - "#convert_json_history('Datasets/phone_history.json')\n", - "# Already ran, created phone_data.txt in Datasets " + "#convert_json_history('Datasets/phone_history.json') # Already ran, created phone_data.txt in Datasets" ] }, { @@ -84,7 +83,24 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import datetime\n", + "import warnings\n", + "from urllib.request import urlopen\n", + "import json\n", + "import re\n", + "\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": { "scrolled": true }, @@ -165,16 +181,12 @@ "4 2014-11-03 21:47:49.271 40.421286 -3.628635 33" ] }, - "execution_count": 2, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import datetime\n", - "\n", "phone_df = pd.read_csv('Datasets/phone_data.txt', parse_dates=['Dates'], sep='\\t')\n", "phone_df.head()" ] @@ -188,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -261,7 +273,7 @@ "4 2014-11-03 21:47:49.271 33 (40.421286200000004, -3.6286354)" ] }, - "execution_count": 3, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -281,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -333,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": 
[], "source": [ @@ -441,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -507,7 +519,7 @@ "2014-10-04 1413" ] }, - "execution_count": 7, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -521,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -597,7 +609,7 @@ "max 1429.000000" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -616,7 +628,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -682,7 +694,7 @@ "(39.8723041, -5.5697503) 2" ] }, - "execution_count": 17, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -703,7 +715,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -769,7 +781,7 @@ "(40.4207763, -3.6332088) 655" ] }, - "execution_count": 18, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -789,7 +801,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -863,7 +875,7 @@ "4 (40.4207763, -3.6332088) 655" ] }, - "execution_count": 19, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -931,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 17, "metadata": { "scrolled": true }, @@ -1031,7 +1043,7 @@ " Avenida de Fuencarral, 18, 28108 Alcobendas (M... 
567" ] }, - "execution_count": 20, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1053,7 +1065,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -1065,7 +1077,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "metadata": { "scrolled": true }, @@ -1083,7 +1095,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 20, "metadata": { "scrolled": false }, @@ -1091,13 +1103,13 @@ { "data": { "text/html": [ - "