Harmonizing fetch behavior with respect to variadicity

Fix #920 cc @diegantobass
medialab · Dec 8, 2023 · ed51d0d · ed51d0d
1 parent 68ed2c8
commit ed51d0d
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 14 deletions.
diff --git a/docs/cli.md b/docs/cli.md
@@ -618,13 +618,11 @@ Optional Arguments:
   --compress-transfer           Whether to send an "Accept-Encoding" header
                                 asking for a compressed response. Usually better
                                 for bandwidth but at the cost of more CPU work.
-  -c, --contents-in-report, -w, --no-contents-in-report
-                                Whether to include retrieved contents, e.g.
+  -c, --contents-in-report      Whether to include retrieved contents, e.g.
                                 html, directly in the report and avoid writing
                                 them in a separate folder. This requires to
                                 standardize encoding and won't work on binary
-                                formats. Note that --contents-in-report is the
-                                default when no input file is given.
+                                formats.
   --domain-parallelism DOMAIN_PARALLELISM
                                 Max number of urls per domain to hit at the same
                                 time. Defaults to `1`.
@@ -770,6 +768,9 @@ Examples:
 . Fetching a batch of url from existing CSV file:
     $ minet fetch url -i file.csv > report.csv
 
+. Piping to minet extract:
+    $ minet fetch url -i file.csv -c | minet extract -i -
+
 . CSV input from stdin (mind the `-`):
     $ xsv select url file.csv | minet fetch url -i - > report.csv
 

diff --git a/ftest/ftest-array.sh b/ftest/ftest-array.sh
@@ -22,7 +22,7 @@ $MINET scrape -p 1 $EXTRACT_DIR/scraper.yml name -i $EXTRACT_DIR/articles.csv -I
 echo "  - CSV bodies"
 $MINET scrape -p 1 $EXTRACT_DIR/scraper.yml -i $EXTRACT_DIR/bodies.csv --body-column html | wc -l
 echo "  - Piping fetch"
-$MINET fetch https://github.com/medialab/minet | $MINET scrape -p 1 $EXTRACT_DIR/scraper.yml -i - | wc -l
+$MINET fetch https://github.com/medialab/minet -c | $MINET scrape -p 1 $EXTRACT_DIR/scraper.yml -i - | wc -l
 echo
 
 echo "Extract"
@@ -35,15 +35,15 @@ $MINET extract -p 1 name -i $EXTRACT_DIR/articles.csv -I $EXTRACT_DIR | wc -l
 echo "  - CSV bodies"
 $MINET extract -p 1 -i $EXTRACT_DIR/bodies.csv --body-column html | wc -l
 echo "  - Piping fetch"
-$MINET fetch https://github.com/medialab/minet | $MINET extract -p 1 -i - | wc -l
+$MINET fetch https://github.com/medialab/minet -c | $MINET extract -p 1 -i - | wc -l
 echo
 
 echo "Resolve"
 $MINET resolve https://medialab.sciencespo.fr/ | grep hit
 echo
 
 echo "Url Extract"
-$MINET fetch https://news.ycombinator.com/ | $MINET url-extract body - --from html | wc -l
+$MINET fetch https://news.ycombinator.com/ -c | $MINET url-extract body - --from html | wc -l
 echo
 
 echo "Url Join"

diff --git a/minet/cli/fetch/__init__.py b/minet/cli/fetch/__init__.py
@@ -99,10 +99,6 @@
 
 
 def resolve_fetch_arguments(cli_args):
-    # If we are hitting a single url we enable contents_in_report by default
-    if cli_args.has_dummy_csv and cli_args.contents_in_report is None:
-        cli_args.contents_in_report = True
-
     if cli_args.dont_save:
         cli_args.contents_in_report = False
 
@@ -155,6 +151,9 @@ def resolve_fetch_arguments(cli_args):
         . Fetching a batch of url from existing CSV file:
             $ minet fetch url -i file.csv > report.csv
 
+        . Piping to minet extract:
+            $ minet fetch url -i file.csv -c | minet extract -i -
+
         . CSV input from stdin (mind the `-`):
             $ xsv select url file.csv | minet fetch url -i - > report.csv
 
@@ -189,10 +188,10 @@ def resolve_fetch_arguments(cli_args):
             "action": "store_true",
         },
         {
-            "flags": ["-c", "--contents-in-report", "-w", "--no-contents-in-report"],
-            "help": "Whether to include retrieved contents, e.g. html, directly in the report and avoid writing them in a separate folder. This requires to standardize encoding and won't work on binary formats. Note that --contents-in-report is the default when no input file is given.",
+            "flags": ["-c", "--contents-in-report"],
+            "help": "Whether to include retrieved contents, e.g. html, directly in the report and avoid writing them in a separate folder. This requires to standardize encoding and won't work on binary formats.",
             "dest": "contents_in_report",
-            "action": BooleanAction,
+            "action": "store_true",
         },
         {
             "flags": ["-D", "--dont-save"],