From 2b2f06ca0c24bb1dcf43a9b6c152bcff347d5d0d Mon Sep 17 00:00:00 2001 From: ArtOfCode- Date: Fri, 19 Aug 2016 09:44:20 +0100 Subject: [PATCH 1/4] scrape last seen information --- chatexchange/_utils.py | 17 +++++++++++++++++ chatexchange/browser.py | 11 +++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/chatexchange/_utils.py b/chatexchange/_utils.py index 6b8c984..7163ef7 100644 --- a/chatexchange/_utils.py +++ b/chatexchange/_utils.py @@ -55,6 +55,23 @@ def html_to_text(html): return s.get_text() +# Seconds since the user was last seen, parsed from text such as '12d ago': the first token is a count plus a unit suffix (s, m, h, d, y). +def parse_last_seen(text): + suffixes = { + 's': 1, + 'm': 60, + 'h': 3600, + 'd': 86400, + 'y': 31536000 + } + splat = text.split(' ') + assert len(splat) == 2, "text doesn't appear to be in format" + char = splat[0][-1] + number = int(splat[0][:-1]) + assert char in suffixes, "suffix char unrecognized" + return number * suffixes[char] + + class LazyFrom(object): """ A descriptor used when multiple lazy attributes depend on a common diff --git a/chatexchange/browser.py b/chatexchange/browser.py index 393fa74..6304e4a 100644 --- a/chatexchange/browser.py +++ b/chatexchange/browser.py @@ -69,7 +69,7 @@ def _request( # Try again if we fail. We're blaming "the internet" for weirdness. MAX_HTTP_RETRIES = 5 # EGAD! A MAGIC NUMBER! 
attempt = 0 - while attempt <= MAX_HTTP_RETRIES: + while attempt <= MAX_HTTP_RETRIES: attempt += 1 response = None try: @@ -549,12 +549,19 @@ def get_profile(self, user_id): else: reputation = -1 + stats_elements = profile_soup.select('.user-valuecell') + if len(stats_elements) >= 3: + last_seen = _utils.parse_last_seen(stats_elements[2].text) + else: + last_seen = _utils.parse_last_seen('20y ago') + return { 'name': name, 'is_moderator': is_moderator, 'message_count': message_count, 'room_count': room_count, - 'reputation': reputation + 'reputation': reputation, + 'last_seen': last_seen } def get_room_info(self, room_id): From ecb54b129210dfbbf6c1846efec4ce57c3f1d624 Mon Sep 17 00:00:00 2001 From: Ralph Embree Date: Fri, 18 Nov 2016 19:41:28 +0000 Subject: [PATCH 2/4] Fix bug when logging in The user ID is retrieved by parsing the user URL. When using / as the delimiter, the user ID is at position 2 if the URL is of the form stackoverflow.com/user/0000/username, but will be at position 4 if the URL is of the form https://stackoverflow.com/user/0000/username. To fix this problem, find the ID from the end rather than from the beginning. My previous patch didn't account for the username at the end of the URL, so please ignore that patch. 
--- chatexchange/browser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chatexchange/browser.py b/chatexchange/browser.py index 6304e4a..170e40f 100644 --- a/chatexchange/browser.py +++ b/chatexchange/browser.py @@ -409,8 +409,8 @@ def get_transcript_with_message(self, message_id): 'transcript/message/%s' % (message_id,)) room_soups = transcript_soup.select('.room-name a') - room_soup = room_soups[-1] - room_id = int(room_soup['href'].split('/')[2]) + room_soup = room_soups[] + room_id = int(room_soup['href'].split('/')[-2]) room_name = room_soup.text messages_data = [] From 221c29e1983433364405a2c88d16ebdd6c6b690f Mon Sep 17 00:00:00 2001 From: Ralph Embree Date: Sat, 19 Nov 2016 04:52:48 +0000 Subject: [PATCH 3/4] Correct accidental deletion --- chatexchange/browser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chatexchange/browser.py b/chatexchange/browser.py index 170e40f..45efda9 100644 --- a/chatexchange/browser.py +++ b/chatexchange/browser.py @@ -409,7 +409,7 @@ def get_transcript_with_message(self, message_id): 'transcript/message/%s' % (message_id,)) room_soups = transcript_soup.select('.room-name a') - room_soup = room_soups[] + room_soup = room_soups[-1] room_id = int(room_soup['href'].split('/')[-2]) room_name = room_soup.text From 81a651851d9ba50c8f789374ca716d2ca26db5f0 Mon Sep 17 00:00:00 2001 From: Ralph Embree Date: Sat, 19 Nov 2016 14:12:14 +0000 Subject: [PATCH 4/4] Fix login bug The same as the last: If the protocol is included in the link, there will be an error --- chatexchange/browser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chatexchange/browser.py b/chatexchange/browser.py index 45efda9..97662dd 100644 --- a/chatexchange/browser.py +++ b/chatexchange/browser.py @@ -223,7 +223,7 @@ def _load_user(self, soup): @staticmethod def user_id_and_name_from_link(link_soup): user_name = link_soup.text - user_id = int(link_soup['href'].split('/')[2]) + user_id = 
int(link_soup['href'].split('/')[-2]) return user_id, user_name def _update_chat_fkey_and_user(self):