diff --git a/phidown/search.py b/phidown/search.py
index 4e733e4..f1af8b5 100755
--- a/phidown/search.py
+++ b/phidown/search.py
@@ -84,6 +84,9 @@ def query_by_filter(
             start_date (str, optional): Start date for filtering (ISO 8601 format). Defaults to None.
             end_date (str, optional): End date for filtering (ISO 8601 format). Defaults to None.
             top (int, optional): Maximum number of results to retrieve. Defaults to 1000.
+            count (bool, optional): Request the total result count from the server. When True and the
+                total exceeds 'top', execute_query() fetches every page automatically, so 'top' acts
+                as the page size rather than a cap on the number of rows. Defaults to False.
             order_by (str, optional): Field and direction to order results by. Defaults to "ContentDate/Start desc".
         """
         self.base_url = base_url  # Set or override base_url
@@ -484,15 +487,65 @@ def _build_query(self):
         return self.url
 
     def execute_query(self):
-        """Execute the query and retrieve data"""
+        """Execute the query and retrieve data.
+
+        If count=True and the total number of results exceeds the 'top' limit,
+        this method automatically pages through the remaining results with
+        the $skip parameter and combines every page into a single DataFrame.
+
+        Returns:
+            pd.DataFrame: DataFrame containing all retrieved products.
+
+        Raises:
+            requests.HTTPError: If the first request returns an error status.
+        """
         url = self._build_query()
         self.response = copy.deepcopy(requests.get(url))
         self.response.raise_for_status()  # Raise an error for bad status codes
 
         self.json_data = self.response.json()
         self.num_results = self.json_data.get('@odata.count', 0)
-        self.df = pd.DataFrame.from_dict(self.json_data['value'])
-        
+
+        # Page only when the caller asked for the count and the server
+        # reports more rows than fit in a single page.
+        if self.count and self.num_results > self.top:
+            return self._execute_paginated_query()
+
+        self.df = pd.DataFrame.from_dict(self.json_data['value'])
+        return self.df
+
+    def _execute_paginated_query(self):
+        """Fetch the pages that follow the one execute_query() already retrieved.
+
+        Every follow-up request reuses the URL produced by _build_query() and
+        only appends $skip, so the filter, ordering and page size cannot drift
+        from the first request. Paging stops early when a request fails or a
+        page comes back empty; rows collected up to that point are kept.
+
+        Returns:
+            pd.DataFrame: Rows of the first page plus every page fetched here.
+        """
+        page_size = self.top  # 'top' doubles as the page size while paging
+        all_data = list(self.json_data.get('value', []))
+        separator = '&' if '?' in self.url else '?'
+
+        for skip in range(page_size, self.num_results, page_size):
+            paginated_url = f"{self.url}{separator}$skip={skip}"
+            try:
+                paginated_response = requests.get(paginated_url)
+                paginated_response.raise_for_status()
+                page_items = paginated_response.json().get('value', [])
+            except (requests.RequestException, ValueError) as e:
+                # Best effort: keep the rows already fetched.
+                print(f"Warning: Error retrieving page at skip={skip}: {e}")
+                break
+
+            if not page_items:
+                # The server ran out of rows before the advertised count.
+                break
+            all_data.extend(page_items)
+
+        self.df = pd.DataFrame.from_dict(all_data)
         return self.df
 
     def query_by_name(self, product_name: str) -> pd.DataFrame:
diff --git a/tests/test_pagination.py b/tests/test_pagination.py
new file mode 100644
index 0000000..c83642e
--- /dev/null
+++ b/tests/test_pagination.py
@@ -0,0 +1,139 @@
+import os
+import sys
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from unittest.mock import Mock, patch
+
+import requests
+
+from phidown.search import CopernicusDataSearcher
+
+
+def _products(start, stop):
+    """Build fake product records product_<start> .. product_<stop - 1>."""
+    return [{'Id': f'product_{i}', 'Name': f'name_{i}'} for i in range(start, stop)]
+
+
+def _mock_response(products, total=None):
+    """Return a Mock mimicking a successful OData response.
+
+    '@odata.count' is only included when 'total' is given, because the
+    follow-up pages in these tests do not carry the count.
+    """
+    payload = {'value': products}
+    if total is not None:
+        payload['@odata.count'] = total
+    response = Mock()
+    response.json.return_value = payload
+    response.raise_for_status = Mock()
+    return response
+
+
+def _searcher(top, count):
+    """Create a SENTINEL-1 SLC searcher with the given paging options."""
+    searcher = CopernicusDataSearcher()
+    searcher.query_by_filter(
+        collection_name='SENTINEL-1',
+        product_type='SLC',
+        top=top,
+        count=count,
+    )
+    return searcher
+
+
+def test_pagination_disabled_by_default():
+    """Test that pagination is not triggered when count=False"""
+    searcher = _searcher(top=10, count=False)
+    # The count exceeds top=10, but count=False must keep this to one request.
+    response = _mock_response(_products(0, 10), total=1500)
+
+    with patch('requests.get', return_value=response) as mock_get:
+        df = searcher.execute_query()
+
+    assert mock_get.call_count == 1
+    assert len(df) == 10  # Only the first page
+
+
+def test_pagination_when_count_enabled_and_results_exceed_top():
+    """Test pagination is triggered when count=True and results > top"""
+    searcher = _searcher(top=5, count=True)
+    pages = [
+        _mock_response(_products(0, 5), total=12),
+        _mock_response(_products(5, 10)),
+        _mock_response(_products(10, 12)),
+    ]
+
+    with patch('requests.get', side_effect=pages) as mock_get:
+        df = searcher.execute_query()
+
+    assert mock_get.call_count == 3
+    assert len(df) == 12
+
+    # Each follow-up request must advance $skip by one page.
+    calls = mock_get.call_args_list
+    assert '$skip=5' in calls[1][0][0]
+    assert '$skip=10' in calls[2][0][0]
+
+
+def test_no_pagination_when_results_within_top_limit():
+    """Test no pagination when count=True but results <= top"""
+    searcher = _searcher(top=100, count=True)
+    response = _mock_response(_products(0, 50), total=50)  # Less than top=100
+
+    with patch('requests.get', return_value=response) as mock_get:
+        df = searcher.execute_query()
+
+    assert mock_get.call_count == 1
+    assert len(df) == 50
+
+
+def test_pagination_with_1000_page_size():
+    """Test pagination with default page size of 1000"""
+    searcher = _searcher(top=1000, count=True)
+    pages = [
+        _mock_response(_products(0, 1000), total=2500),
+        _mock_response(_products(1000, 2000)),
+        _mock_response(_products(2000, 2500)),
+    ]
+
+    with patch('requests.get', side_effect=pages) as mock_get:
+        df = searcher.execute_query()
+
+    assert mock_get.call_count == 3
+    assert len(df) == 2500
+
+    calls = mock_get.call_args_list
+    assert '$skip=1000' in calls[1][0][0]
+    assert '$skip=2000' in calls[2][0][0]
+
+
+def test_pagination_handles_request_errors_gracefully():
+    """Test that pagination handles request errors gracefully"""
+    searcher = _searcher(top=5, count=True)
+    first_page = _mock_response(_products(0, 5), total=15)
+    failing_page = Mock()
+    failing_page.raise_for_status.side_effect = requests.exceptions.ConnectionError("Network error")
+
+    with patch('requests.get', side_effect=[first_page, failing_page]):
+        # Should not raise, but return the rows fetched before the failure.
+        df = searcher.execute_query()
+
+    assert len(df) == 5
+    assert 'product_0' in df['Id'].values
+
+
+def test_pagination_stops_when_server_returns_empty_page():
+    """Test that an empty page ends paging even if the count promises more"""
+    searcher = _searcher(top=5, count=True)
+    pages = [
+        _mock_response(_products(0, 5), total=20),
+        _mock_response([]),
+    ]
+
+    with patch('requests.get', side_effect=pages) as mock_get:
+        df = searcher.execute_query()
+
+    # A third request would exhaust side_effect and fail the test.
+    assert mock_get.call_count == 2
+    assert len(df) == 5