Merge pull request #37 from icy/curl
Curl
icy authored Apr 13, 2020
2 parents 8033e5e + 1fc09d0 commit a8af509
Showing 10 changed files with 74 additions and 196 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -4,5 +4,5 @@ language:
script:
- sudo apt-get install shellcheck
- shellcheck *.sh
- ( cd tests/ && openssl aes-256-cbc -K $encrypted_e3ddca67c2d3_key -iv $encrypted_e3ddca67c2d3_iv -in private-cookies.txt.enc -out private-cookies.txt -d ; )
- ( cd tests/ && openssl aes-256-cbc -K $encrypted_4d6c5775c90a_key -iv $encrypted_4d6c5775c90a_iv -in curl-options.txt.enc -out curl-options.txt -d ;)
- ./tests/tests.sh
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,9 @@
## v2.0.0

* Using `curl` instead of `wget`
* Fix #36 (unable to read cookie file)
* Fix #34 (`413 Request Entity Too Large`)

## v1.2.2

* Loop detection: #24.
69 changes: 35 additions & 34 deletions README.md
@@ -24,7 +24,7 @@ Groups with adult contents haven't been supported yet.

## Installation

The script requires `bash-4`, `sort`, `wget`, `sed`, `awk`.
The script requires `bash-4`, `sort`, `curl`, `sed`, `awk`.

Make the script executable with `chmod 755` and put it in your path
(e.g., `/usr/local/bin/`).
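
For example, installing from a local clone could look like the sketch below
(assuming the repository is cloned to the current directory and that
`/usr/local/bin/` is in your `$PATH`; adjust the paths to your system):

    git clone https://github.com/icy/google-group-crawler
    cd google-group-crawler
    chmod 755 crawler.sh
    sudo cp crawler.sh /usr/local/bin/
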
@@ -39,16 +39,16 @@ https://github.com/icy/google-group-crawler/issues/26.
For a private group, please
[prepare your cookie file](#private-group-or-group-hosted-by-an-organization).

# export _WGET_OPTIONS="-v" # use wget options to provide e.g, cookies
# export _CURL_OPTIONS="-v" # use curl options to provide e.g., cookies
# export _HOOK_FILE="/some/path" # provide a hook file, see in #the-hook

# export _ORG="your.company" # required if you are using Gsuite
export _GROUP="mygroup" # specify your group
./crawler.sh -sh # first run for testing
./crawler.sh -sh > wget.sh # save your script
bash wget.sh # downloading mbox files
./crawler.sh -sh > curl.sh # save your script
bash curl.sh # downloading mbox files

You can execute `wget.sh` script multiple times, as `wget` will skip
You can execute `curl.sh` script multiple times, as `curl` will skip
quickly any fully downloaded files.

### Update your local archive thanks to RSS feed
@@ -66,32 +66,33 @@ It's useful to follow this way frequently to update your local archive.
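
A typical RSS-based update might look like the following sketch (an illustration
only; it assumes `_GROUP` is exported as in the section above, and `_RSS_NUM`
is optional with a default of 50):

    export _GROUP="mygroup"           # the group to update
    export _RSS_NUM=50                # optional: number of recent feed items
    ./crawler.sh -rss > update.sh     # generate an update script from the feed
    bash update.sh                    # fetch messages; existing files are skipped
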
### Private group or Group hosted by an organization

To download messages from a private group or a group hosted by your organization,
you need to provide cookies in legacy format.

1. Export cookies for `google` domains from your browser and
save them as file. Please use a Netscape format, and you may want to
edit the file to meet a few conditions:

1. The first line should be `# Netscape HTTP Cookie File`
2. The file must use tab instead of space.
3. The first field of every line in the file must be `groups.google.com`.

A simple script to process this file is as below

$ cat original_cookies.txt \
| tail -n +3 \
| awk -v OFS='\t' \
'BEGIN {printf("# Netscape HTTP Cookie File\n\n")}
{$1 = "groups.google.com"; printf("%s\n", $0)}'

See the sample files in the `tests/` directory

1. The original file: [tests/sample-original-cookies.txt](tests/sample-original-cookies.txt)
1. The fixed file: [tests/sample-fixed-cookies.txt](tests/sample-fixed-cookies.txt)

2. Specify your cookie file by `_WGET_OPTIONS`:

export _WGET_OPTIONS="--load-cookies /your/path/fixed_cookies.txt --keep-session-cookies"
you need to provide some cookie information to the script. In the past,
the script used `wget` and the Netscape cookie file format; now it uses
`curl` with a cookie string supplied via a configuration file.

0. Open Firefox and press F12 to open its developer tools, then select the
Network tab. (You may find a similar way in your favorite browser.)
1. Log in to the Google account you want to test with, and access your group.
For example
https://groups.google.com/forum/?_escaped_fragment_=categories/google-group-crawler-public
(replace `google-group-crawler-public` with your group name).
Make sure you can read some content with your own group URI.
2. Now, from the Network tab of the developer tools, select the request for
that address and choose `Copy -> Copy Request Headers`. The result contains a
lot of headers; paste them into your text editor and keep only the `Cookie` part.
3. Now prepare a file `curl-options.txt` as below

user-agent = "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0"
header = "Cookie: <snip>"

Of course, replace the `<snip>` part with your own cookie string.
See `man curl` for more details of the configuration file format.

4. Specify your cookie file with `_CURL_OPTIONS`:

export _CURL_OPTIONS="-K /path/to/curl-options.txt"

Now every hidden group can be downloaded :)
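
Before running a full crawl, it may help to check the options file manually.
The following is only a sketch (`mygroup` is a placeholder, and `-K` makes
`curl` read its options from the file):

    # you should see some group content, not a login or consent page
    curl -Ls -K /path/to/curl-options.txt \
      "https://groups.google.com/forum/?_escaped_fragment_=categories/mygroup" \
      | head -n 20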

@@ -100,13 +101,13 @@
If you want to execute a `hook` command after an `mbox` file is downloaded,
you can do as below.

1. Prepare a Bash script file that contains a definition of `__wget_hook`
1. Prepare a Bash script file that contains a definition of `__curl_hook`
command. The first argument specifies the output filename, and the
second argument specifies the URL. For example, here is a simple hook:

# $1: output file
# $2: url (https://groups.google.com/forum/message/raw?msg=foobar/topicID/msgID)
__wget_hook() {
__curl_hook() {
if [[ "$(stat -c %b "$1")" == 0 ]]; then
echo >&2 ":: Warning: empty output '$1'"
fi
@@ -119,7 +120,7 @@ you can do as below.
to your file. For example,

export _GROUP=archlinuxvn
export _HOOK_FILE=$HOME/bin/wget.hook.sh
export _HOOK_FILE=$HOME/bin/curl.hook.sh

Now the hook file will be loaded by the scripts generated by
`crawler.sh -sh` or `crawler.sh -rss`.
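
As an illustration only, a hook file (e.g., the `$HOME/bin/curl.hook.sh` path
used in the example above) could warn about empty downloads and keep a small
log of fetched URLs; this is a sketch, not part of the project:

    # example hook; adjust to your needs
    # $1: output file
    # $2: url
    __curl_hook() {
      if [[ ! -s "$1" ]]; then
        echo >&2 ":: Warning: empty output '$1'"
      fi
      echo "$(date +%s) $2 -> $1" >> "${_D_OUTPUT:-.}/hook.log"
    }
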
50 changes: 25 additions & 25 deletions crawler.sh
@@ -60,10 +60,10 @@ _short_url() {

_links_dump() {
# shellcheck disable=2086
wget \
--user-agent="$_USER_AGENT" \
$_WGET_OPTIONS \
-O- "$@" \
curl \
--user-agent "$_USER_AGENT" \
$_CURL_OPTIONS \
-Lso- "$@" \
| sed -e "s#['\"]#\\"$'\n#g' \
| grep -E '^https?://' \
| sort -u
@@ -107,14 +107,15 @@ _download_page() {

# Loop detection. See also
# https://github.com/icy/google-group-crawler/issues/24
# FIXME: 2020/04: This isn't necessary after Google has changed something
if [[ $__ -ge 1 ]]; then
if diff "$_f_output" "$1.$(( __ - 1 ))" >/dev/null 2>&1; then
echo >&2 ":: =================================================="
echo >&2 ":: Loop detected. Your cookie may not work correctly."
echo >&2 ":: You may want to generate new cookie file"
echo >&2 ":: and/or remove all '#HttpOnly_' strings from it."
echo >&2 ":: =================================================="
exit 1
exit 125
fi
fi

@@ -177,7 +178,7 @@ _main() {
| sed -e 's#/d/msg/#/forum/message/raw?msg=#g' \
| while read -r _url; do
_id="$(echo "$_url"| sed -e "s#.*=$_GROUP/##g" -e 's#/#.#g')"
echo "__wget__ \"$_D_OUTPUT/mbox/m.${_id}\" \"$_url\""
echo "__curl__ \"$_D_OUTPUT/mbox/m.${_id}\" \"$_url\""
done
}

@@ -187,10 +188,10 @@ _rss() {
{
echo >&2 ":: Fetching RSS data..."
# shellcheck disable=2086
wget \
--user-agent="$_USER_AGENT" \
$_WGET_OPTIONS \
-O- "https://groups.google.com${_ORG:+/a/$_ORG}/forum/feed/$_GROUP/msgs/rss.xml?num=${_RSS_NUM}"
curl \
--user-agent "$_USER_AGENT" \
$_CURL_OPTIONS \
-Lso- "https://groups.google.com${_ORG:+/a/$_ORG}/forum/feed/$_GROUP/msgs/rss.xml?num=${_RSS_NUM}"
} \
| grep '<link>' \
| grep 'd/msg/' \
@@ -203,26 +204,26 @@ _rss() {
_id_origin="$(sed -e "s#.*$_GROUP/##g" <<<"$_url")"
_url="https://groups.google.com${_ORG:+/a/$_ORG}/forum/message/raw?msg=$_GROUP/$_id_origin"
_id="${_id_origin//\//.}"
echo "__wget__ \"$_D_OUTPUT/mbox/m.${_id}\" \"$_url\""
echo "__curl__ \"$_D_OUTPUT/mbox/m.${_id}\" \"$_url\""
done
}

# $1: Output File
# $2: The URL
__wget__() {
__curl__() {
if [[ ! -f "$1" ]]; then
# shellcheck disable=2086
wget \
--user-agent="$_USER_AGENT" \
$_WGET_OPTIONS \
"$2" -O "$1"
__wget_hook "$1" "$2"
curl -Ls \
-A "$_USER_AGENT" \
$_CURL_OPTIONS \
"$2" -o "$1"
__curl_hook "$1" "$2"
fi
}

# $1: Output File
# $2: The URL
__wget_hook() {
__curl_hook() {
:
}

@@ -242,9 +243,9 @@ _ship_hook() {
echo "export _GROUP=\"\${_GROUP:-$_GROUP}\""
echo "export _D_OUTPUT=\"\${_D_OUTPUT:-$_D_OUTPUT}\""
echo "export _USER_AGENT=\"\${_USER_AGENT:-$_USER_AGENT}\""
echo "export _WGET_OPTIONS=\"\${_WGET_OPTIONS:-$_WGET_OPTIONS}\""
echo "export _CURL_OPTIONS=\"\${_CURL_OPTIONS:-$_CURL_OPTIONS}\""
echo ""
declare -f __wget_hook
declare -f __curl_hook

if [[ -f "${_HOOK_FILE:-}" ]]; then
declare -f __sourcing_hook
@@ -254,7 +255,7 @@ _ship_hook() {
exit 1
fi

declare -f __wget__
declare -f __curl__
}

_help() {
@@ -270,7 +271,7 @@ _has_command() {

_check() {
local _requirements=
_requirements="wget sort awk sed diff"
_requirements="curl sort awk sed diff"
# shellcheck disable=2086
_has_command $_requirements \
|| {
@@ -290,15 +291,14 @@ __main__() { :; }
set -u

_ORG="${_ORG:-}"
_GROUP="${_GROUP,,}"
_GROUP="${_GROUP:-}"
_D_OUTPUT="${_D_OUTPUT:-./${_ORG:+${_ORG}-}${_GROUP}/}"
# _GROUP="${_GROUP//+/%2B}"
_USER_AGENT="${_USER_AGENT:-Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0}"
_WGET_OPTIONS="${_WGET_OPTIONS:-}"
_CURL_OPTIONS="${_CURL_OPTIONS:-}"
_RSS_NUM="${_RSS_NUM:-50}"

export _ORG _GROUP _D_OUTPUT _USER_AGENT _WGET_OPTIONS _RSS_NUM
export _ORG _GROUP _D_OUTPUT _USER_AGENT _CURL_OPTIONS _RSS_NUM

_check || exit

Binary file added tests/curl-options.txt.enc
15 changes: 0 additions & 15 deletions tests/fix_cookies.sh

This file was deleted.

Binary file removed tests/private-cookies.txt.enc
57 changes: 0 additions & 57 deletions tests/sample-fixed-cookies.txt

This file was deleted.

