From 844333fca7bc92a11c43795cb50f5a6abbacc786 Mon Sep 17 00:00:00 2001 From: Prashanth R Date: Tue, 18 Jul 2023 21:32:13 -0700 Subject: [PATCH] NL: Switch NL place-detector default to `dc` (#2963) Also, remove the check-box from frontend. The diffs are pretty clean (except for an underlying bug - https://github.com/datacommonsorg/website/issues/2962). Finally, check for bad-words also from punctuation-removed query. --- server/integration_tests/nlnext_test.py | 10 +++--- .../usa_map_types/query_3/chart_config.json | 6 ++-- server/lib/nl/detection/place.py | 9 ++++-- server/routes/nl/api.py | 16 +++++----- shared/lib/utils.py | 7 +++-- static/js/apps/nl_interface/app.tsx | 8 ++--- static/js/apps/nl_interface/nl_options.tsx | 31 ------------------- static/js/apps/nl_interface/query_search.tsx | 4 --- 8 files changed, 31 insertions(+), 60 deletions(-) diff --git a/server/integration_tests/nlnext_test.py b/server/integration_tests/nlnext_test.py index 7080abce45..a6974dcec1 100644 --- a/server/integration_tests/nlnext_test.py +++ b/server/integration_tests/nlnext_test.py @@ -35,7 +35,7 @@ def run_sequence(self, detector='hybrid', check_place_detection=False, expected_detectors=[], - place_detector='ner', + place_detector='dc', failure=''): if detector == 'heuristic': detection_method = 'Heuristic Based' @@ -213,7 +213,8 @@ def test_place_detection_e2e_ner(self): 'show me the population of mexico city', 'counties in the US with the most poverty', ], - check_place_detection=True) + check_place_detection=True, + place_detector='ner') # This test uses DC's Recognize Places API. # TODO: "US" is not detected by RecognizePlaces. @@ -227,8 +228,7 @@ def test_place_detection_e2e_dc(self): 'show me the population of mexico city', 'counties in the US with the most poverty', ], - check_place_detection=True, - place_detector='dc') + check_place_detection=True) def test_international(self): self.run_sequence('international', [ @@ -249,7 +249,7 @@ def test_demo_usa_map_types(self): # Shows map of tracts. 'household median income across tracts of Placer County', # Shows map of ZCTAs. - 'how many people are unemployed in zip codes of washington state?' + 'how many people are unemployed in zip codes of washington?' ]) def test_sdg(self): diff --git a/server/integration_tests/test_data/usa_map_types/query_3/chart_config.json b/server/integration_tests/test_data/usa_map_types/query_3/chart_config.json index 72b10903c6..b1f6bab421 100644 --- a/server/integration_tests/test_data/usa_map_types/query_3/chart_config.json +++ b/server/integration_tests/test_data/usa_map_types/query_3/chart_config.json @@ -83,7 +83,7 @@ "items": [ { "displayName": "California", - "nlQuery": "how many people are unemployed in zip codes of california state?" + "nlQuery": "how many people are unemployed in zip codes of california?" } ] }, @@ -91,11 +91,11 @@ "items": [ { "displayName": "Show highest", - "nlQuery": "how many people are unemployed in zip codes of washington state? - show highest" + "nlQuery": "how many people are unemployed in zip codes of washington? - show highest" }, { "displayName": "Show lowest", - "nlQuery": "how many people are unemployed in zip codes of washington state? - show lowest" + "nlQuery": "how many people are unemployed in zip codes of washington? - show lowest" } ] } diff --git a/server/lib/nl/detection/place.py b/server/lib/nl/detection/place.py index f071de1f65..73c04ef930 100644 --- a/server/lib/nl/detection/place.py +++ b/server/lib/nl/detection/place.py @@ -94,7 +94,10 @@ def detect_from_query_ner(cleaned_query: str, orig_query: str, # Uses NER to detect place names, recons to DCIDs, produces PlaceDetection object. # def detect_from_query_dc(orig_query: str, debug_logs: Dict) -> PlaceDetection: - query_items = dc.recognize_places(orig_query) + # Recognize Places uses comma as a signal for contained-in-place. + query = utils.remove_punctuations(orig_query, include_comma=True) + + query_items = dc.recognize_places(query) nonplace_query_parts = [] places_str = [] @@ -131,7 +134,7 @@ def detect_from_query_dc(orig_query: str, debug_logs: Dict) -> PlaceDetection: # Set PlaceDetection. query_without_place_substr = ' '.join(nonplace_query_parts) place_detection = PlaceDetection( - query_original=orig_query, + query_original=query, query_without_place_substr=query_without_place_substr, query_places_mentioned=places_str, places_found=resolved_places, @@ -139,7 +142,7 @@ def detect_from_query_dc(orig_query: str, debug_logs: Dict) -> PlaceDetection: _set_query_detection_debug_logs(place_detection, debug_logs) # This only makes sense for this flow. debug_logs["query_transformations"] = { - "place_detection_input": orig_query, + "place_detection_input": query, "place_detection_with_places_removed": query_without_place_substr, } return place_detection diff --git a/server/routes/nl/api.py b/server/routes/nl/api.py index 6e523eadcf..6ce00d19f4 100644 --- a/server/routes/nl/api.py +++ b/server/routes/nl/api.py @@ -63,6 +63,7 @@ def data(): if not current_app.config.get('NL_BAD_WORDS'): logging.error('Missing NL_BAD_WORDS config!') flask.abort(404) + nl_bad_words = current_app.config['NL_BAD_WORDS'] disaster_config = current_app.config['NL_DISASTER_CONFIG'] if current_app.config['LOCAL']: @@ -82,7 +83,7 @@ def data(): 'detector', default=RequestedDetectorType.Heuristic.value, type=str) place_detector_type = request.args.get('place_detector', - default='ner', + default='dc', type=str).lower() if place_detector_type not in [PlaceDetectorType.NER, PlaceDetectorType.DC]: logging.error(f'Unknown place_detector {place_detector_type}') @@ -90,19 +91,20 @@ def data(): else: place_detector_type = PlaceDetectorType(place_detector_type) + query = str(escape(shared_utils.remove_punctuations(original_query))) + if not query: + return _abort('Received an empty query, please type a few words :)', + original_query, context_history) + # # Check offensive words # - if not bad_words.is_safe(original_query, current_app.config['NL_BAD_WORDS']): + if (not bad_words.is_safe(original_query, nl_bad_words) or + not bad_words.is_safe(query, nl_bad_words)): return _abort( 'The query was rejected due to the ' + 'presence of inappropriate words.', original_query, context_history) - query = str(escape(shared_utils.remove_punctuations(original_query))) - if not query: - return _abort('Received an empty query, please type a few words :)', - original_query, context_history) - counters = ctr.Counters() query_detection_debug_logs = {} query_detection_debug_logs["original_query"] = query diff --git a/shared/lib/utils.py b/shared/lib/utils.py index 93c1ed268b..07fdaea15f 100644 --- a/shared/lib/utils.py +++ b/shared/lib/utils.py @@ -110,9 +110,12 @@ def combine_stop_words() -> Set[str]: return stop_words -def remove_punctuations(s): +def remove_punctuations(s, include_comma=False): s = s.replace('\'s', '') - s = re.sub(r'[^\w\s]', ' ', s) + if include_comma: + s = re.sub(r'[^\w\s,]', ' ', s) + else: + s = re.sub(r'[^\w\s]', ' ', s) return " ".join(s.split()) diff --git a/static/js/apps/nl_interface/app.tsx b/static/js/apps/nl_interface/app.tsx index b9de6ee067..1d79e8ea6c 100644 --- a/static/js/apps/nl_interface/app.tsx +++ b/static/js/apps/nl_interface/app.tsx @@ -50,10 +50,10 @@ export function App(): JSX.Element { const [detector, setDetector] = useState( getUrlTokenOrDefault(NL_URL_PARAMS.DETECTOR, NL_DETECTOR_VALS.HEURISTIC) ); - const [placeDetector, setPlaceDetector] = useState( + const placeDetector = useRef( getUrlTokenOrDefault( NL_URL_PARAMS.PLACE_DETECTOR, - NL_PLACE_DETECTOR_VALS.NER + NL_PLACE_DETECTOR_VALS.DC ) ); const urlPrompts = useRef(getUrlPrompts()); @@ -190,7 +190,7 @@ export function App(): JSX.Element { query={q} indexType={indexType} detector={detector} - placeDetector={placeDetector} + placeDetector={placeDetector.current} contextHistory={getContextHistory(i)} addContextCallback={addContext} showData={false} @@ -221,10 +221,8 @@ export function App(): JSX.Element { }} indexType={indexType} detector={detector} - placeDetector={placeDetector} setIndexType={setIndexType} setDetector={setDetector} - setPlaceDetector={setPlaceDetector} /> {isStartState && } diff --git a/static/js/apps/nl_interface/nl_options.tsx b/static/js/apps/nl_interface/nl_options.tsx index 02a50c3d01..ddc2b814ab 100644 --- a/static/js/apps/nl_interface/nl_options.tsx +++ b/static/js/apps/nl_interface/nl_options.tsx @@ -32,8 +32,6 @@ interface NLOptionsPropType { setIndexType: (idx: string) => void; detector: string; setDetector: (v: string) => void; - placeDetector: string; - setPlaceDetector: (v: string) => void; } export function NLOptions(props: NLOptionsPropType): JSX.Element { @@ -109,35 +107,6 @@ export function NLOptions(props: NLOptionsPropType): JSX.Element { -
Place:
-
- - - - -
); } diff --git a/static/js/apps/nl_interface/query_search.tsx b/static/js/apps/nl_interface/query_search.tsx index 7873418480..696e6b70a2 100644 --- a/static/js/apps/nl_interface/query_search.tsx +++ b/static/js/apps/nl_interface/query_search.tsx @@ -32,8 +32,6 @@ interface QuerySearchPropType { setIndexType: (idx: string) => void; detector: string; setDetector: (v: string) => void; - placeDetector: string; - setPlaceDetector: (v: string) => void; } export function QuerySearch(props: QuerySearchPropType): JSX.Element { @@ -104,8 +102,6 @@ export function QuerySearch(props: QuerySearchPropType): JSX.Element { setIndexType={props.setIndexType} detector={props.detector} setDetector={props.setDetector} - placeDetector={props.placeDetector} - setPlaceDetector={props.setPlaceDetector} />