From e0a7f7eee1f15a0d5c1ccc5192a62a184136c903 Mon Sep 17 00:00:00 2001 From: Ky-Anh Huynh Date: Mon, 13 Apr 2020 07:37:47 +0200 Subject: [PATCH 1/2] Using curl. Fix #36, #34 --- .travis.yml | 2 +- README.md | 58 +++++++++++++++--------------- crawler.sh | 50 +++++++++++++------------- tests/curl-options.txt.enc | Bin 0 -> 1008 bytes tests/fix_cookies.sh | 15 -------- tests/private-cookies.txt.enc | Bin 8976 -> 0 bytes tests/sample-fixed-cookies.txt | 57 ----------------------------- tests/sample-original-cookies.txt | 57 ----------------------------- tests/tests.sh | 14 ++++---- 9 files changed, 62 insertions(+), 191 deletions(-) create mode 100644 tests/curl-options.txt.enc delete mode 100755 tests/fix_cookies.sh delete mode 100644 tests/private-cookies.txt.enc delete mode 100644 tests/sample-fixed-cookies.txt delete mode 100644 tests/sample-original-cookies.txt diff --git a/.travis.yml b/.travis.yml index ffe91ad..88538fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,5 +4,5 @@ language: script: - sudo apt-get install shellcheck - shellcheck *.sh -- ( cd tests/ && openssl aes-256-cbc -K $encrypted_e3ddca67c2d3_key -iv $encrypted_e3ddca67c2d3_iv -in private-cookies.txt.enc -out private-cookies.txt -d ; ) +- ( cd tests/ && openssl aes-256-cbc -K $encrypted_4d6c5775c90a_key -iv $encrypted_4d6c5775c90a_iv -in curl-options.txt.enc -out curl-options.txt -d ;) - ./tests/tests.sh diff --git a/README.md b/README.md index 78ba0f1..da5ea81 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Groups with adult contents haven't been supported yet. ## Installation -The script requires `bash-4`, `sort`, `wget`, `sed`, `awk`. +The script requires `bash-4`, `sort`, `curl`, `sed`, `awk`. Make the script executable with `chmod 755` and put them in your path (e.g, `/usr/local/bin/`.) @@ -39,16 +39,16 @@ https://github.com/icy/google-group-crawler/issues/26. For private group, please [prepare your cookies file](#private-group-or-group-hosted-by-an-organization). 
- # export _WGET_OPTIONS="-v" # use wget options to provide e.g, cookies + # export _CURL_OPTIONS="-v" # use curl options to provide e.g, cookies # export _HOOK_FILE="/some/path" # provide a hook file, see in #the-hook # export _ORG="your.company" # required, if you are using Gsuite export _GROUP="mygroup" # specify your group ./crawler.sh -sh # first run for testing - ./crawler.sh -sh > wget.sh # save your script - bash wget.sh # downloading mbox files + ./crawler.sh -sh > curl.sh # save your script + bash curl.sh # downloading mbox files -You can execute `wget.sh` script multiple times, as `wget` will skip +You can execute `curl.sh` script multiple times, as `curl` will skip quickly any fully downloaded files. ### Update your local archive thanks to RSS feed @@ -66,32 +66,32 @@ It's useful to follow this way frequently to update your local archive. ### Private group or Group hosted by an organization To download messages from private group or group hosted by your organization, -you need to provide cookies in legacy format. +you need to provide some cookie information to the script. In the past, +the script used `wget` and the Netscape cookie file format, +now we are using `curl` with cookie string and a configuration file. -1. Export cookies for `google` domains from your browser and - save them as file. Please use a Netscape format, and you may want to - edit the file to meet a few conditions: +0. Open Firefox, press F12 to enable Debug mode and select Network tab + from the Debug console of Firefox. (You may find a similar way for + your favorite browser.) +1. Log in to your testing google account, and access your group. + For example + https://groups.google.com/forum/?_escaped_fragment_=categories/google-group-crawler-public + (replace `google-group-crawler-public` with your group name). + Make sure you can read some contents with your own group URI. +2. Now from the Network tab in Debug console, select the address + and select `Copy -> Copy Request Headers`. 
You will have a lot of + things in the result, but please paste them in your text editor + and select only `Cookie` part. +3. Now prepare a file `curl-options.txt` as below - 1. The first line should be `# Netscape HTTP Cookie File` - 2. The file must use tab instead of space. - 3. The first field of every line in the file must be `groups.google.com`. + user-agent = "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0" + header = "Cookie: " - A simple script to process this file is as below + Of course, replace the `` part with your own cookie strings - $ cat original_cookies.txt \ - | tail -n +3 \ - | awk -v OFS='\t' \ - 'BEGIN {printf("# Netscape HTTP Cookie File\n\n")} - {$1 = "groups.google.com"; printf("%s\n", $0)}' +2. Specify your cookie file by `_CURL_OPTIONS`: - See the sample files in the `tests/` directory - - 1. The original file: [tests/sample-original-cookies.txt](tests/sample-original-cookies.txt) - 1. The fixed file: [tests/sample-fixed-cookies.txt](tests/sample-fixed-cookies.txt) - -2. Specify your cookie file by `_WGET_OPTIONS`: - - export _WGET_OPTIONS="--load-cookies /your/path/fixed_cookies.txt --keep-session-cookies" + export _CURL_OPTIONS="-K /path/to/curl-options.txt" Now every hidden group can be downloaded :) @@ -100,13 +100,13 @@ you need to provide cookies in legacy format. If you want to execute a `hook` command after a `mbox` file is downloaded, you can do as below. -1. Prepare a Bash script file that contains a definition of `__wget_hook` +1. Prepare a Bash script file that contains a definition of `__curl_hook` command. The first argument is to specify an output filename, and the second argument is to specify an URL. For example, here is simple hook # $1: output file # $2: url (https://groups.google.com/forum/message/raw?msg=foobar/topicID/msgID) - __wget_hook() { + __curl_hook() { if [[ "$(stat -c %b "$1")" == 0 ]]; then echo >&2 ":: Warning: empty output '$1'" fi @@ -119,7 +119,7 @@ you can do as below. 
to your file. For example, export _GROUP=archlinuxvn - export _HOOK_FILE=$HOME/bin/wget.hook.sh + export _HOOK_FILE=$HOME/bin/curl.hook.sh Now the hook file will be loaded in your future output of commands `crawler.sh -sh` or `crawler.sh -rss`. diff --git a/crawler.sh b/crawler.sh index 443b851..cb193c4 100755 --- a/crawler.sh +++ b/crawler.sh @@ -60,10 +60,10 @@ _short_url() { _links_dump() { # shellcheck disable=2086 - wget \ - --user-agent="$_USER_AGENT" \ - $_WGET_OPTIONS \ - -O- "$@" \ + curl \ + --user-agent "$_USER_AGENT" \ + $_CURL_OPTIONS \ + -Lso- "$@" \ | sed -e "s#['\"]#\\"$'\n#g' \ | grep -E '^https?://' \ | sort -u @@ -107,6 +107,7 @@ _download_page() { # Loop detection. See also # https://github.com/icy/google-group-crawler/issues/24 + # FIXME: 2020/04: This isn't necessary after Google has changed something if [[ $__ -ge 1 ]]; then if diff "$_f_output" "$1.$(( __ - 1 ))" >/dev/null 2>&1; then echo >&2 ":: ==================================================" @@ -114,7 +115,7 @@ _download_page() { echo >&2 ":: You may want to generate new cookie file" echo >&2 ":: and/or remove all '#HttpOnly_' strings from it." echo >&2 ":: ==================================================" - exit 1 + exit 125 fi fi @@ -177,7 +178,7 @@ _main() { | sed -e 's#/d/msg/#/forum/message/raw?msg=#g' \ | while read -r _url; do _id="$(echo "$_url"| sed -e "s#.*=$_GROUP/##g" -e 's#/#.#g')" - echo "__wget__ \"$_D_OUTPUT/mbox/m.${_id}\" \"$_url\"" + echo "__curl__ \"$_D_OUTPUT/mbox/m.${_id}\" \"$_url\"" done } @@ -187,10 +188,10 @@ _rss() { { echo >&2 ":: Fetching RSS data..." 
# shellcheck disable=2086 - wget \ - --user-agent="$_USER_AGENT" \ - $_WGET_OPTIONS \ - -O- "https://groups.google.com${_ORG:+/a/$_ORG}/forum/feed/$_GROUP/msgs/rss.xml?num=${_RSS_NUM}" + curl \ + --user-agent "$_USER_AGENT" \ + $_CURL_OPTIONS \ + -Lso- "https://groups.google.com${_ORG:+/a/$_ORG}/forum/feed/$_GROUP/msgs/rss.xml?num=${_RSS_NUM}" } \ | grep '' \ | grep 'd/msg/' \ @@ -203,26 +204,26 @@ _rss() { _id_origin="$(sed -e "s#.*$_GROUP/##g" <<<"$_url")" _url="https://groups.google.com${_ORG:+/a/$_ORG}/forum/message/raw?msg=$_GROUP/$_id_origin" _id="${_id_origin//\//.}" - echo "__wget__ \"$_D_OUTPUT/mbox/m.${_id}\" \"$_url\"" + echo "__curl__ \"$_D_OUTPUT/mbox/m.${_id}\" \"$_url\"" done } # $1: Output File # $2: The URL -__wget__() { +__curl__() { if [[ ! -f "$1" ]]; then # shellcheck disable=2086 - wget \ - --user-agent="$_USER_AGENT" \ - $_WGET_OPTIONS \ - "$2" -O "$1" - __wget_hook "$1" "$2" + curl -Ls \ + -A "$_USER_AGENT" \ + $_CURL_OPTIONS \ + "$2" -o "$1" + __curl_hook "$1" "$2" fi } # $1: Output File # $2: The URL -__wget_hook() { +__curl_hook() { : } @@ -242,9 +243,9 @@ _ship_hook() { echo "export _GROUP=\"\${_GROUP:-$_GROUP}\"" echo "export _D_OUTPUT=\"\${_D_OUTPUT:-$_D_OUTPUT}\"" echo "export _USER_AGENT=\"\${_USER_AGENT:-$_USER_AGENT}\"" - echo "export _WGET_OPTIONS=\"\${_WGET_OPTIONS:-$_WGET_OPTIONS}\"" + echo "export _CURL_OPTIONS=\"\${_CURL_OPTIONS:-$_CURL_OPTIONS}\"" echo "" - declare -f __wget_hook + declare -f __curl_hook if [[ -f "${_HOOK_FILE:-}" ]]; then declare -f __sourcing_hook @@ -254,7 +255,7 @@ _ship_hook() { exit 1 fi - declare -f __wget__ + declare -f __curl__ } _help() { @@ -270,7 +271,7 @@ _has_command() { _check() { local _requirements= - _requirements="wget sort awk sed diff" + _requirements="curl sort awk sed diff" # shellcheck disable=2086 _has_command $_requirements \ || { @@ -290,15 +291,14 @@ __main__() { :; } set -u _ORG="${_ORG:-}" -_GROUP="${_GROUP,,}" _GROUP="${_GROUP:-}" 
_D_OUTPUT="${_D_OUTPUT:-./${_ORG:+${_ORG}-}${_GROUP}/}" # _GROUP="${_GROUP//+/%2B}" _USER_AGENT="${_USER_AGENT:-Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0}" -_WGET_OPTIONS="${_WGET_OPTIONS:-}" +_CURL_OPTIONS="${_CURL_OPTIONS:-}" _RSS_NUM="${_RSS_NUM:-50}" -export _ORG _GROUP _D_OUTPUT _USER_AGENT _WGET_OPTIONS _RSS_NUM +export _ORG _GROUP _D_OUTPUT _USER_AGENT _CURL_OPTIONS _RSS_NUM _check || exit diff --git a/tests/curl-options.txt.enc b/tests/curl-options.txt.enc new file mode 100644 index 0000000000000000000000000000000000000000..0e424a77bd15af8b642d14697acd43ad91b63f8f GIT binary patch literal 1008 zcmV9>|U(U>?8bDiZ|9D`s^hLyQ@{)?zg~q-nE@CY{QWaAV^wn{K1q6 zH978o#>!YXJB55h?uEe)4j;69cUU1q7+U3ZiA1)GuC2y_+(?9KQ!yg zRX<^uROWQ5zG0?&JK(L?9T?y%8X`Dx~MQj zxI5ltHef+b@!g+GSTo;&lWeEx-3{}r>unj_a|RlOr&k2`(wUZ0YI{*U-s@>>4-dXB zC34xB>1ppd69P;=#f{Yqc8&ZIzC6&xQ4Ek#?Iclyi+Y4zc5ozAV!-GQq7=yX(br>ayP%)Ct5QLbbE`n^t`!qspttrCX-ac9l#=@%btg0s!K2cZ zZeyHee%B(r9ASo~D!nCq@jt8Y=6ua_2s6|J4Maq)xSS zA}I)q>g`hCbScX-l~kLX4&$c?4MY_6y5Xemm#dC;_RD0Dgh^X_oV)*QCc z)B{2+98evDOicvsJuIvARr$f#iH*#PkX!kQvcJTOX*VH)=Sz3TrS&oEwxDEK9V>7C zROlOz4f4+M+S+d^*NMuQzNZ*O0VyY&heb>0{D_*>lF{%+r{U-D=Z7-ht;=xs&*Ung zGt?mlzm`tiV5Kk1_DToGcsLyEkBlUvl8Swq<hZ z?pP73XIpvvDI6wtNHVpzvun$dpRSJF7D6|d%U7SnnqmOGA z6R^)b;EQjP7&3jJpR8^sGOV>@dfMpAo|6cD5f-0VlxNNbXuGPIhTk5nKpBb_@f5I8 zN>IZ$#;tUM3{Y?I5aJ$0!+Wn5YJBeefHw~a&|2d9+10B=M9&=)6s{0URoo9ZB1cR9 e@zv>{De(;~3RH%Wu6>%_{!L+)fmkA?FzG*$Pyvhp literal 0 HcmV?d00001 diff --git a/tests/fix_cookies.sh b/tests/fix_cookies.sh deleted file mode 100755 index 83cb068..0000000 --- a/tests/fix_cookies.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -# Author : Ky-Anh Huynh -# License : MIT - -_file="${1:-}" -if [[ -z "${_file}" || ! -f "$_file" ]]; then - echo >&2 ":: Missing original cookie file as the first argument." 
- exit 127 -fi - -< "$_file" tail -n +3 \ -| awk -v OFS='\t' \ - 'BEGIN {printf("# Netscape HTTP Cookie File\n\n")} - {$1 = "groups.google.com"; printf("%s\n", $0)}' diff --git a/tests/private-cookies.txt.enc b/tests/private-cookies.txt.enc deleted file mode 100644 index d992a7a33447119d3bb5896dac0531842f7e4ec4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8976 zcmV+rBk$ZX4f13DZy0$h8wH20Ob#5Ry`I=<#uj#er@1(nD3 zJ+i4!9+s<9AZ7%C>`^EG(+1N4!O6@DNeL`uFbRQOw42&2V1d&9%`TkgX@OZ zZWDRc^|dd!6qb`vj?2VZ)_+3>-|RCWapE}4AI4lk@EE!JY-376UaAR)URI+wpnXkd z0bGXI!~tvl(cLMa=LZj@w2Cl_(HS_PR<+($)xc92k|jEQ7R(*|m4Sbw7tOu}{AI+< ztmlg5`YkWQSLR~z`%V<3P7>{3%g7E*a*L|3zEB%v49>}*)MFNV?FZ&29MCP2nc}B% z+U*IRO`gdnC$cMmr5D90a!>6~ruA*t0*6CK^JPQmf7HX~>x>n|tvFr}frb|6gP;G| z01KI*BC0PmGdIf%GmVFjP@ey&9<~3q{r85Vj0M|Az|r1B{1wLPA6π8qjG?#qzv zpOzkw-Im9JHa*QqVS=qhSQO2ljg|5$=91q68ARwZbr{csc*$EQm$xEbc_NN`cnlZr z-?fJ=x-*tjC+mpMDQnDH-0R93q)7(_@9vSiFmn;=FgDc`1<~*yM#61|lo60g*5Gg( zj+{^B=59&^(&BY#gqA+`kUEg8(({(5i(;n(K!w^$SD---j?KrUyiYGO-UsnG*tj*Ay7&P_^mFys+ zQF(=W(|9Ox7p7z9Z`R6fzOTHNJ1j6Bz{}!lC=@(&DbJmVbsaAEQ^O29H(HW14AXHn zzKBuW9&vNQm}C6Z|4UABZ4phc+Gm7DOqkMfe5KpnV_w)mkfD#=}DJQc5NHE?m7^d zp-2i#;-@847O6>8X0wH*={h_fQ}cp{H? zjr*b_oIl1WW7>dPynsjTnoW~pP(YP8sHj|2y5j~J;;yoNpO6B1>G`Sz!5+&|>7I?L zVr0&M{!jGwCkk?4u}}q&xPFCTq3b)DzL&lzIv3}ZQ$8CJeAHbn=Cq3`tof-TzKF(C zJ-GrO;%LtL>1WrXkL z_kO~7lFJc-%asZ~P#>*e1>llLnMQP_JsN?S$P9#)=H#x!HiH*|@BB zl5#HdKerheM)4pFh7Wa9?5qpu+gKt!@!{s#ki?!!$0%)=RFR7bW6}!`z~2Y^i!)u! zKOE*luNJY6Tu~4F0yf*X>!Gu_e88biEk$~U+ejHCyIlc|H^!A%F>6ACs)E&EU{xl+ z3gEcSjD!4ye6-UdgD(;j~8N-|vET7#W5!su6BXq<#Cti33)puF%ZC|Sl#p4#H z2UlUMx}8EoLL66Hy=%oK;&y=^i;tZg1+{#x#?})_VQY8N(XcVf7@#y{KZMu<#_Nt| zca6iT*muqJ6lZZa+gE1(IHq;x-nBnj#H0a04pgO?W3m}}5r2%tJT*&yXK=QeC-a%hKfo8@}WfznUq?RH{RCNx}pDY=Yp-^ua zI=?4KYA++TcVEKcEuqQp5}Tjfx7rKfbZUohthX8Ad2@^w3B0qpH|bVb{Q>TV-Nbou zx~NZ7Bjs-sM4OvxHWxq#P@Z8u3xDk@AseJf*09roc=vqjTuDhK=&?UUg#%Fi)!tFx zHtktDIf^jX-UeIA7X>fN((Wozfue^at! 
zgyA=1ks%d{lM3}ogbyy?-_A}$L9b<+aTlB72Y=Cj<5Xn)6)g%HTt*b*&wLd~VL^UE z1@XFrU8;@9f)bxD{kwy~Pvm8Eu)W>P95>pI{Hx0@J{fg^{&2U+lUe}jCu)j+lIBCg z>uA|+hD{tANSN*h$VQ(#J%k2)p?i!}Aoge&ixD|Gw@SBDc#U+db^WO9;`HRbo>YDI zOCm*?pE|T5W#9rAYqzY?3JKvpMhC4##mT=Dsq5O|p+~5w|du1dUr}vhJ6!?gH$pp4IeOJlmrxTz=^h3y0>_CU^35&JT zKjtwZODx|)PU28y(xflaugL=8H&qKms8eL~D}qNPns zWMn)W@tZ|yx}vQu5kb)KWDF+cB+CT{7xMP`hS}46lumF<`Zr8LTjd<&_CuPhrrWH6 z95>lGX2G)nt{rbuxNV-Z0@KtGA{rABL%@g7CqaT&`rjnjQp85-?*J`e{K%#ulmy<^ zH&p%j|FPnUlL`CT)E0b#y;JG|-sdiI*huLHwuku=$O$sPhIfouVAe^xx7l5HCStM< zZN(=eY?c&LznnEqEd-%-u=kj+A%A4HIAY@G>%?@X5NO%85TVsIf$)~%SMN^Dr>F#Z zKNanL4Hd^|eH_Sf%b#XuRfmoYdI`8f1ZPwk6o%ks_`1Xd#A%js&6Yni7eOf zI1iLjFHkDd*pa_du{38k147;qK$T=sDgXqc#N^TRShK|v+^Tqt`jL$&LnctT3p*$6 z{$G3U4ZIS?W4DWK`*i1;RjDB~ODg!hZn5kc`jY_N?smPZ$|uzrFGNZbZDn#-Vc^KkYLuhRU zzJNM1ZvOfOwXjUT_(gJ9W?q#`av1AGVQqxYrtx8#3rarfOW-Di`MQ}Tmr-?^_^cwU;BTL1>3*c z7yZe_+A`+QAec*&q0B~Y{i1WAzI065P26;%E!2^K==7?*XD@c&{3liyfkj7D4iQ41 zuzvt#ltUOR6SR55VSvZ>KHJzFhKr?J;w&u@%2xRO_U!;INt}<^uA;^JXC8Z;ghJw? z;>S`t|C;u4OIh>+t4$?)0e)E&q2GF)T=MK~HHZ-jg8jJOtTS8hPfz zj401w)oFac{fWw;^?|NWYu4w*D-N742{R=ZNB%o?$UClR1C%}+>h8-;Id7=1kfmON zxt)1g43m-7yPU|5{ML9sdkC_k{A*Q+K-H;jo*I33XYvVr>88W?B{N4`>o1~5P^DPR zxPh3L%ghwv_d7-EXZNn~Wj?H_pKWx121Y$f+d&mGI)$t*4t$2id4WI$$XELJeuL!&IW%_mReEU9!j2;zEcPNr3paBly? 
zUC44nHD>0qhbDkDz)O`tuc0z4*`u6)Fu&}LR%$+YAI$Oy5>lC^RMN6I{#C5Nk&DhA zkz95CvFy|Nj$Q!)%4C1>LsTl5(zDHquTNE`9QhmD=JuP3cz|Sa6+x@F!=I|qEjs~F?9FvJeLf-H?xPozXx?4Jw9j-_Djw4uBEoYMTm zuyQ0P?%woF+2k{9RHVvKzY=dW+cnjz+%b7~$9W}ALP#V6z;P9DuuVgKwvh~_@LW7e zNF7GrQn%?|n8-+klga*jUG#V=g-|ppIe?5ndK}3`J_ZO&Cu5{h{`8OWNRjwt&9#6= z`=UEXtN;6wJPU-IsB9XmNm!99c&<-2WPol_t%*WO=AjC3wx+dx@Lz4mc-o~*ng(K} zSSxqdSiCN+>nr~w9C+KtN6Nr_#Y|qoI2{iMfx4Ce3bpZLzuqlo(ip?;ZHQK~4d7aJ z6vGq-10!SuR`*vLBN2=iMAp8o6Cl*3o@N`wXZo&@=wTl)J^DUcz{H-eXG47vIijM38C`Bw1)95Fic~ej4po_HjJs(!j<5F zxt)qV2lLkOQbZ47J!@#Xn*@zI9vu$45QUqVDAv;h6)p#JKA_xpa}<6YcuDncEk!9q7I|^!^I*H#f49-k#?*k0fiMkfru}H0sk&h1}Sfp z^j5;GpKmhD1Zk5=_hTh-Gd_0y)(OTRG|anP4C~WFUjYL~g&eK%6q+Qt2hF;x!*Y@Q zx|P-F*K;(_(AJM;Yg+iszv0g|=r_UBuG-JK+IY@Vbcv(raFuB3$tpY?PI&hVb$4qT zMDk(&sjl2# z9-hyliV;_khx&J%H;KwyhQ%OKMFq<)B9;(3N&SrfDM=uUv@Ssf10BHz;%27>0MPg; zwM@=jGAk~mj6jbkrqeMKbk)qWqi*cf6e*B#NwC>I$3hB*lc1I|PBJX8y;8oqwy|Jh zfK||nTZG3#75Yf@g4YR+f*wQjKRfP2djQ#0^&8T(Uo;Lwu?X_4)0$K5w9D^#*A=J7 zmz4*?G}z>pu^cL9QrDD(jg_i81YLSu%t}AjH_V;SIbx1F7K-q0E*9+K;n!l-FTx`L z0tw^houbIWJVsdjUz2Z1f@Um-oXn)Q$&1Pgs9B0ir0T&7TfVMotVoZ$9o>S;hU5=HGj_4z4B)WGk| zl6l(+YOEFD-aLB@vCOw4{*I90>F2P*8#3lm;)&5=FSd}>spGfQFN zsiGIuFeQvWA9M6=VDaBTJ$`4J>{zrZRi!zslQ&|WHK3AawBp^T0eBm)(Mx#TU=uvd z+UUF#7aM{&pXFxlIR1X;VPeM2m9V|agMCfRiCi+iy9-<`2Ic8Rj&WC{<1GKT7J+zA z@CdjfM|K@g?wUV%{jE#z0~pjrtC~k+c@JRaD_id!`)7!kLi&3V%2B1bRyCV}0$Z;5 zhtioj6iiHB$!V-Yh9n8Yu~du=Au36*ZC9?iogwYWQ}3(aytd4yZU>|Hsl~GH&E&T2 z2}KS7x=w-wOWd@0vux*)YG692U5kQ+;~TR`4JflC_z7AF1mVBY2*BreNx$&-E4mR> zo7!p-2%5Nh+$J7@MIWHL3r*M-{Lg+?lveqZD-3>F5UYVU3*u&yI(e~a2qh6nm|iXt z$gJYH$~{XU*$U4RlxYu`I);p;nN_&J zmyj>=_^|&=y_|jYM3Cl@jV>)#csWe5CF-(cY-VWhUwHRRstK}{osz6uBh7$0I9bu* zu(evcKQAtWoh45UQac>ImuKAhVB;Vzt1#-Cnzv^nIL+M?WL}LNd_4>srlJN) zEgE9Z$&IX-cqutsXa|u8<9C95*$=w&!cD&NVXne4!nG;+a!EI0brim*fvg7PmWU6( z2~&2R1g|uIgS??pd)j3Qb?I(V{`wi zJQ+N+Xq555)+HI~Ff_>cxx{72-K*I*E}a-kTy4%X(~4iVRt_Fn=W41StMXnY!M`Me z@2@%ZOt5>g#y6kPwyD^;9%p@(ig7uMu7`;Q|E!uhr2>g(o-h+On3tBQ)wlVm+XFrq 
z=cP|!w^t$p5BPgv6%ad_%7U@cm`$klyQ>V%Sm6U9DIx)h0MON9=?Q}Eo4W$*pEGmNrjdEuez;Q0yf5EOfXgo9v&a%A7kIL+kibfb zijdNv-~Eyks6?uXs^jh7$2Wb)HLa32sn^t`rR{|L+|l&YC>xH;?rLJc8zsly%m~j1 zDnXoTfldO29DY&JpetL&65*^_2Ji_0iTPtOuV&708*WwB6T{C0*?juq47#!VhN3yz zs$?uW#qqXvL>KVF>dcme6_7mAkvN&*RB(@W>pIv>S>6NV@$FU!`FGWR(u%2Rv`~BO zlg<~40A~TX=&-AWT!4_%_Q05QE}oZz>?cxxi{gYHCH%^%zG6tuJG!ty%55BTTlNR( zKAS#Qu@{Gn!qe&io20-L75zrDjzkFU6IdF$Yh@qXZ1=ZWlxbdY%co8J>}oW*%-9iX zD}}_LgRcF%X)!^+SesVIL~d^xmXc6b#T;tIgNPT^(mtdp;>|s%gX`#IuksDNkj$^C zD2v`gM74mSZ=L(U6ssDEnOhl~nJ$_QcJl_;%V`rZ700J}ofgUU|9eA3&1`@*t3X+#DIJb56Cz6J>qzFh*zAMg zWDJ4P(0h8QxbK<|<%*XlLwN{VY^q5#O__&Opo!N$HLN?tW>{TwfUNm<=-yZz_VeIoxD`zsx9o`-VedT-nTYZ< zQrCD2(WkJro>bz4k^>J;jPkYlbj)TqpYu>U)fq5iN0G)EKtN_2OHbS>5kYf~h15yv zMj&nvTSt>`>>hKeta)^Hpl@qc`De#?S$;NofXY;E@dOLrHFEdTU{s1z>+Q7IOgUTV z&Z2KDVOcQ4ox43kk9?nve_|;tB{W3AbVLXSzsCXIdFWaC?p;kxso0CfD#xyTzBle3 z`T&tFg5S?UARvL|aF>rzonXud&IVfV=?AB02X&Aep+wUiojm+#gs zl)U^p6-68VqN7T!uW=QLS_T>U8gZNxGY#esb4H?de`^RG-1 zAp?1dZ_F*jd~G;jQ+ClU);+?n6tR)pyxH)wf+ib#94snY;7hulv(R_~-s2D6_(iFB zb75?+4&HRhs~UGk(SQLjfiX5OK!-D6!bI@pQ+<;v>ekpE+z^AcA9K&Xw6~ zLRWGPoQwXf5aK%Z#st}w>wBHI5`ay%I0qc5)b=QtR`XKcF_Cl-j?w?P(Gs08p^FMX z`QV;v@MVn*%RSMQk(4wTDthk>_i9cfdPuZczt|`HprTgHF{L5&d@Em-UOj;OK|gDl zGo2AOAXS;BolHRZ^+kdiOu11Ax4AMgKXx$U!VN9$Di= z$}$8u8>*60*gb+_c%dHL=CNgbn|k`m0tvXK4@v?7NYnSD`y0B3=k)0f{sSw`oj>7v zw791NPig?pyt>3Jppun2Y+~^?-?O*&jJbfmK>?sV3c){G)Ku6UaMQj`9Ggy2_*L%u z%e&h*5LId}sk2iABs(0ifIZ=MIbi2bOTV`V-txd-5GmRb+!tosK`8o4cFNa?`gG&D)HD~_kH@cqeA)GQE*mEL1RYP5@z6_D~q$S^@q0sUNw0Bjvy$-_)Zl=E##fz9?{!h0-k||gmNM)HS zi+|U^s+I0b7_F?%R+V^s%ERtNWMtC5Hy{LyZfElX4B6J+pM?otY>au@q)>pK1}QM? 
z!#WgR9GV--1!F!UkW%ezxm6HAxq^QVy_D$MdU4y(1os}}!0*JP5?kOWZ26LN>&eZ0i-u~nt{SlW%Kmfj2esMobuZwYQfT=c%Kn*k( zO$uy4`i9bDmApN+Hhr1~dQuTek|WX^n(t!`3ApN6D{P}mgG(7{6kw5J@4Y$p12svG z-`7}pj7u-`g6ZG(RlCvQZLSX_LB(;oL#lmccCN)v=#XlI0vOL&0m5TX=sI~-@1N>vF zwpSY4KhV~Crd+g;J))juw54oy&HEc;y+ELeLvR}#sr%&{eHBK*f-D8B%2@tgIUcGY z?D3nz!U#NP_@D6x}Er|vwa2P~3 z#xdjICcT3HN!4~Y&?08clB61ziQR^Hvudn?w}%uD=QOfskHv@&PBp`&<5Ou^Kuy#> z>>?Ggz-w=yY7ZjIl2oxR^_CeJaT#BLqYDsIiLmjl(#HCm>YTXK`@9T%X3oJz*EbRq zCTsG@B(c0gSZy`XA&6Wn*XDaW&@sAtAfOx~zm0gnnN`*A&wi@t4Tb7IvM2A&XnU)%JQJxA{;#S?cHqYR>UK00`@l(hY-z|Zq zLb`j0$%Bv5bw%CDb^WE|X-}r&k{sJh8>ir;?nx$fS@S|2A8%^&%IMpo%8Gbl$A~({ zQMZ#e@rIvjRa;Q4J|WixP1sGus9QV#_$zlfAu?8A*N%Vs_00LEAZaV1Y!NSLMc%%HH=3y3~g%TlJPqXIU4bm`}y~ zy6g7W+q>hpb0jJ@$uqH9!8=P$wg&|$oR(sb-f(^E1F+(alVfaHPiYN2oWUS8e_Asf zRGj$GY-0&k3+MiRK9gbKDv3jRvi!AHJg1{KV>BRwy%inRugaTyJ*nnFGS(K`=}^!g zDS-}*YvZ|Bk0({h)xn;?I0@TL!iZ!*BCHV|O*C$)Al}NqbSPJvRGG4|Ca&RPGf>%3 z?04lk)PCW;9by|CiFnB}AsJKeurU~=IGw4825`=fi&_$$H;i<6*RVCOF`E{%)povx zg!C|`dH-Dd5BynejZ9t~_)>gDqUH57w=N#4bO}Zp`hw6R1O;3UBLo4_s4eL@>fm3| zSIh9Xjo=RVD~Edl86m*LbV&4c(#daR5{dk{n~gf+t$2@WG?m1r>_Da-pSH9@;(Hw+ zWbxzL+~QsrnjO7;S7b6zd>7!Db+6Y5;fuNPw-WBo0N+p!(WVjN6J2uSNJw|P?l diff --git a/tests/sample-fixed-cookies.txt b/tests/sample-fixed-cookies.txt deleted file mode 100644 index e8183c2..0000000 --- a/tests/sample-fixed-cookies.txt +++ /dev/null @@ -1,57 +0,0 @@ -# Netscape HTTP Cookie File - -groups.google.com FALSE / TRUE 1611077714 SSID hidden-text -groups.google.com FALSE / FALSE 1611077714 APISID hidden-text -groups.google.com FALSE / TRUE 1611077714 SAPISID hidden-text -groups.google.com FALSE / FALSE 2146723199 CONSENT hidden-text -groups.google.com FALSE / FALSE 1563816914 NID hidden-text -groups.google.com FALSE / FALSE 1596216768 __utmx hidden-text -groups.google.com FALSE / FALSE 1596216768 __utmxx hidden-text -groups.google.com FALSE / FALSE 1611077708 SID hidden-text -groups.google.com FALSE / FALSE 1611077708 HSID hidden-text -groups.google.com FALSE / 
TRUE 1611077708 SSID hidden-text -groups.google.com FALSE / FALSE 1611077708 APISID hidden-text -groups.google.com FALSE / TRUE 1611077708 SAPISID hidden-text -groups.google.com FALSE / FALSE 2146723199 CONSENT hidden-text -groups.google.com FALSE / FALSE 1563818795 NID hidden-text -groups.google.com FALSE / FALSE 1550599595 OGPC hidden-text -groups.google.com FALSE / FALSE 1550601067 1P_JAR hidden-text -groups.google.com FALSE / FALSE 1555785148 SIDCC hidden-text -groups.google.com FALSE /ads FALSE 1594623600 AID hidden-text -groups.google.com FALSE /ads/measurement FALSE 1549218527 TAID hidden-text -groups.google.com FALSE /complete/search FALSE 1563784933 CGIC hidden-text -groups.google.com FALSE /gmail/about/ FALSE 1548006168 __utmt hidden-text -groups.google.com FALSE /gmail/about/ FALSE 1611077683 __utma hidden-text -groups.google.com FALSE /gmail/about/ FALSE 1563773683 __utmz hidden-text -groups.google.com FALSE /gmail/about/ FALSE 1548007483 __utmb hidden-text -groups.google.com FALSE /gmail/about/ FALSE 0 __utmc hidden-text -groups.google.com FALSE /search FALSE 1563784933 CGIC hidden-text -groups.google.com FALSE /verify TRUE 1563817480 SNID hidden-text -groups.google.com TRUE / TRUE 1611077714 OSID hidden-text -groups.google.com TRUE / TRUE 0 S hidden-text -groups.google.com TRUE /mail TRUE 1548869698 COMPASS hidden-text -groups.google.com TRUE /mail/u/0 TRUE 0 GMAIL_AT hidden-text -groups.google.com TRUE /sync/u/0 TRUE 1548869719 COMPASS hidden-text -groups.google.com TRUE / TRUE 1611077722 OSID hidden-text -groups.google.com TRUE / TRUE 1550597724 OTZ hidden-text -groups.google.com FALSE / FALSE 0 __utmc hidden-text -groups.google.com FALSE / FALSE 1548006891 __utmt_*groups_ga* hidden-text -groups.google.com FALSE / FALSE 1611078291 __utma hidden-text -groups.google.com FALSE / FALSE 1548008091 __utmb hidden-text -groups.google.com FALSE / FALSE 1563774291 __utmz hidden-text -groups.google.com TRUE / TRUE 0 groupsloginpref hidden-text 
-groups.google.com TRUE / TRUE 1550597719 OTZ hidden-text -groups.google.com TRUE / TRUE 1611077708 ACCOUNT_CHOOSER hidden-text -groups.google.com TRUE / TRUE 1611078283 GAPS hidden-text -groups.google.com TRUE / TRUE 1611078283 LSID hidden-text -groups.google.com FALSE / FALSE 1548006309 __utmt_t0 hidden-text -groups.google.com FALSE / FALSE 1611077713 __utma hidden-text -groups.google.com FALSE / FALSE 1548007513 __utmb hidden-text -groups.google.com FALSE / FALSE 1563773713 __utmz hidden-text -groups.google.com FALSE / FALSE 0 __utmc hidden-text -groups.google.com TRUE / TRUE 1611077708 OSID hidden-text -groups.google.com TRUE / TRUE 1550597709 OTZ hidden-text -groups.google.com FALSE /insights/consumersurveys FALSE 1550598285 PAIDCONTENT hidden-text -groups.google.com TRUE / FALSE 1548009524 DV hidden-text -groups.google.com FALSE / FALSE 1594598400 AID hidden-text -groups.google.com TRUE / FALSE 1548008086 google_push hidden-text diff --git a/tests/sample-original-cookies.txt b/tests/sample-original-cookies.txt deleted file mode 100644 index d8af7b7..0000000 --- a/tests/sample-original-cookies.txt +++ /dev/null @@ -1,57 +0,0 @@ -.google.de FALSE / FALSE 1611077714 SID hidden-text -#HttpOnly_.google.de FALSE / FALSE 1611077714 HSID hidden-text -#HttpOnly_.google.de FALSE / TRUE 1611077714 SSID hidden-text -.google.de FALSE / FALSE 1611077714 APISID hidden-text -.google.de FALSE / TRUE 1611077714 SAPISID hidden-text -.google.de FALSE / FALSE 2146723199 CONSENT hidden-text -#HttpOnly_.google.de FALSE / FALSE 1563816914 NID hidden-text -.google.com FALSE / FALSE 1596216768 __utmx hidden-text -.google.com FALSE / FALSE 1596216768 __utmxx hidden-text -.google.com FALSE / FALSE 1611077708 SID hidden-text -#HttpOnly_.google.com FALSE / FALSE 1611077708 HSID hidden-text -#HttpOnly_.google.com FALSE / TRUE 1611077708 SSID hidden-text -.google.com FALSE / FALSE 1611077708 APISID hidden-text -.google.com FALSE / TRUE 1611077708 SAPISID hidden-text -.google.com FALSE / 
FALSE 2146723199 CONSENT hidden-text -#HttpOnly_.google.com FALSE / FALSE 1563818795 NID hidden-text -.google.com FALSE / FALSE 1550599595 OGPC hidden-text -.google.com FALSE / FALSE 1550601067 1P_JAR hidden-text -.google.com FALSE / FALSE 1555785148 SIDCC hidden-text -#HttpOnly_.google.com FALSE /ads FALSE 1594623600 AID hidden-text -#HttpOnly_.google.com FALSE /ads/measurement FALSE 1549218527 TAID hidden-text -#HttpOnly_.google.com FALSE /complete/search FALSE 1563784933 CGIC hidden-text -.google.com FALSE /gmail/about/ FALSE 1548006168 __utmt hidden-text -.google.com FALSE /gmail/about/ FALSE 1611077683 __utma hidden-text -.google.com FALSE /gmail/about/ FALSE 1563773683 __utmz hidden-text -.google.com FALSE /gmail/about/ FALSE 1548007483 __utmb hidden-text -.google.com FALSE /gmail/about/ FALSE 0 __utmc hidden-text -#HttpOnly_.google.com FALSE /search FALSE 1563784933 CGIC hidden-text -#HttpOnly_.google.com FALSE /verify TRUE 1563817480 SNID hidden-text -#HttpOnly_mail.google.com TRUE / TRUE 1611077714 OSID hidden-text -#HttpOnly_mail.google.com TRUE / TRUE 0 S hidden-text -#HttpOnly_mail.google.com TRUE /mail TRUE 1548869698 COMPASS hidden-text -mail.google.com TRUE /mail/u/0 TRUE 0 GMAIL_AT hidden-text -#HttpOnly_mail.google.com TRUE /sync/u/0 TRUE 1548869719 COMPASS hidden-text -#HttpOnly_notifications.google.com TRUE / TRUE 1611077722 OSID hidden-text -notifications.google.com TRUE / TRUE 1550597724 OTZ hidden-text -.groups.google.com FALSE / FALSE 0 __utmc hidden-text -.groups.google.com FALSE / FALSE 1548006891 __utmt_*groups_ga* hidden-text -.groups.google.com FALSE / FALSE 1611078291 __utma hidden-text -.groups.google.com FALSE / FALSE 1548008091 __utmb hidden-text -.groups.google.com FALSE / FALSE 1563774291 __utmz hidden-text -groups.google.com TRUE / TRUE 0 groupsloginpref hidden-text -contacts.google.com TRUE / TRUE 1550597719 OTZ hidden-text -#HttpOnly_accounts.google.com TRUE / TRUE 1611077708 ACCOUNT_CHOOSER hidden-text 
-#HttpOnly_accounts.google.com TRUE / TRUE 1611078283 GAPS hidden-text -#HttpOnly_accounts.google.com TRUE / TRUE 1611078283 LSID hidden-text -.myaccount.google.com FALSE / FALSE 1548006309 __utmt_t0 hidden-text -.myaccount.google.com FALSE / FALSE 1611077713 __utma hidden-text -.myaccount.google.com FALSE / FALSE 1548007513 __utmb hidden-text -.myaccount.google.com FALSE / FALSE 1563773713 __utmz hidden-text -.myaccount.google.com FALSE / FALSE 0 __utmc hidden-text -#HttpOnly_myaccount.google.com TRUE / TRUE 1611077708 OSID hidden-text -myaccount.google.com TRUE / TRUE 1550597709 OTZ hidden-text -.www.google.com FALSE /insights/consumersurveys FALSE 1550598285 PAIDCONTENT hidden-text -www.google.com TRUE / FALSE 1548009524 DV hidden-text -#HttpOnly_.googleadservices.com FALSE / FALSE 1594598400 AID hidden-text -x.bidswitch.net TRUE / FALSE 1548008086 google_push hidden-text diff --git a/tests/tests.sh b/tests/tests.sh index fe6b56b..c4b4b86 100755 --- a/tests/tests.sh +++ b/tests/tests.sh @@ -8,7 +8,7 @@ _test_public_1() { echo >&2 "" echo >&2 ":: --> Testing Public Group $_GROUP (ORG: ${_ORG:-}) <--" - echo >&2 ":: --> _WGET_OPTIONS: ${_WGET_OPTIONS:-}" + echo >&2 ":: --> _CURL_OPTIONS: ${_CURL_OPTIONS:-}" echo >&2 "" echo >&2 ":: Removing $PWD/$_D_OUTPUT" rm -rf "$PWD/$_D_OUTPUT/" @@ -37,7 +37,7 @@ _test_reset() { unset _D_OUTPUT unset _F_OUTPUT unset _GREP_MESSAGE - unset _WGET_OPTIONS + unset _CURL_OPTIONS } _test_public_1_with_cat() { @@ -47,14 +47,14 @@ _test_public_1_with_cat() { _test_public_1 ) } + _test_public_2_loop_detection() { ( _test_reset export _ORG="viettug.org" export _GROUP="google-group-crawler-public2" - export _WGET_OPTIONS="--load-cookies /dev/null --keep-session-cookies" _test_public_1 - [[ $? == 1 ]] \ + [[ $? == 125 ]] \ || { echo >&2 ":: Unable to detect a loop." 
return 1 @@ -68,7 +68,7 @@ _test_public_2_with_cookie() { _test_reset export _ORG="viettug.org" export _GROUP="google-group-crawler-public2" - export _WGET_OPTIONS="--load-cookies $(pwd -P)/private-cookies.txt --keep-session-cookies" + export _CURL_OPTIONS="--config curl-options.txt" export _GREP_MESSAGE="This is a public group from a private organization" _test_public_1 ) @@ -78,7 +78,7 @@ _test_private_1() { ( _test_reset export _GROUP="google-group-crawler-private" - export _WGET_OPTIONS="--load-cookies $(pwd -P)/private-cookies.txt --keep-session-cookies" + export _CURL_OPTIONS="--config curl-options.txt" _test_public_1 ) } @@ -92,6 +92,6 @@ export PATH="$PATH:$(pwd -P)/../" _test_public_1 || exit 1 _test_public_1_with_cat || exit 1 -_test_public_2_loop_detection || exit 1 +#_test_public_2_loop_detection || exit 1 _test_public_2_with_cookie || exit 2 _test_private_1 || exit 3 From 1fc09d0ec56e7c6513d07789f5cfbb0601c862ab Mon Sep 17 00:00:00 2001 From: Ky-Anh Huynh Date: Mon, 13 Apr 2020 07:43:04 +0200 Subject: [PATCH 2/2] Update ChangeLog, version 2.0.0 --- CHANGELOG.md | 6 ++++++ README.md | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2523ef9..33eb6e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## v2.0.0 + +* Using `curl` instead of `wget` +* Fix #36 (unable to read cookie file) +* Fix #34 (`413 Request Entity Too Large`) + ## v1.2.2 * Loop detection: #24. diff --git a/README.md b/README.md index da5ea81..1d80fac 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,8 @@ now we are using `curl` with cookie string and a configuration file. user-agent = "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0" header = "Cookie: " - Of course, replace the `` part with your own cookie strings + Of course, replace the `` part with your own cookie strings. + See `man curl` for more details of the file format. 2. Specify your cookie file by `_CURL_OPTIONS`: