btwotch    

, Deutschland · männlich · registriert seit 2008 · heute zuletzt online


mailto, against spam!

N900 Hostmode

Computer ·

mount your n900 on a n900 :grins:


n900 <-> n900

–> http://www.spin.de/hp/sebastian_m/blog/id/16637281#entrymain

20. April 2011 22:48

Tags:  ·  ·  ·  ·  ·

How to download a website?

Sonstiges · · 4 Kommentare

What if you wanna make an archive of a website?

  • you print it into a pdf
  • you just save it

What if you wanna do that daily?

  • browse daily and save or print into pdf
  • use wget --recursive --level=$foo
  • use httrack

So?

  • browsing daily is too time-consuming
  • wget and httrack just fail (though I haven't tried them in a long time)
  • U wanna use my tool :grins:

This tool/library is not ready for real use yet; it is meant to become a library, but API access is not fully
working yet :shakehead: so this is only a demo.

So what does this software do?

  • using libxml to scan for html tags (img, script, style, ...)
  • convert to utf8
  • scan css files for urls :klatsch:
  • correct relative urls (<a href=...)
  • download all these files very fast (http://www.wurstball.de/all takes only about 5mins - >>14000 files)

here's the code for the library

    #include <stdarg.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <strings.h>
    #include <libxml/parser.h>
    #include <libxml/HTMLparser.h>
    #include <libxml/xmlerror.h>
    #include <curl/curl.h>
    #include <pthread.h> //
   10 
   11 #include "getpage.h"
   12 
   13 
   14 #define FILELENGTH 150
   15 #define CURL_TIMEOUT_SEC 240
   16 #define SELECT_TIMEOUT_SEC 10
   17 #define MAX_P_FILE_DOWNLOADS 10
   18 
   19 //#define DEBUG
   20 
   /* Characters _filename_gen() draws from when it builds the random part of
      a file name. The literal must stay on one line: a line break inside it
      would put whitespace into the alphabet (and does not compile). */
   static char ALPHABET[] =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890";
   23 
   /* Kind of resource a queued download refers to. Each value is a distinct
      single bit. */
   enum FILETYPE
   {
     IMG     = 1 << 0, /* images: img/input src and link icons */
     STYLE   = 1 << 1, /* linked stylesheets */
     SCRIPT  = 1 << 2, /* script src */
     IFRAME  = 1 << 3, /* iframe src */
     FRAME   = 1 << 4,
     PDF     = 1 << 5,
     CSS_IMG = 1 << 6, /* URLs found inside CSS url(...) */
     NONE    = 1 << 7  /* printed as "OTHER" by _filetype_string() */
   };
   35 
   36 struct _replace_info // scanner state shared by replace() and replace_step()
   37 {
   38   char *begin; // marker string that opens a gap ("url(" in _set_css_ri)
   39   char end; // single character that closes a gap (')' in _set_css_ri)
   40 
   41 
   42   void (*userfunction) (void*, char*, int, bool); // called by replace() with (userdata, text, length, inside-gap flag)
   43   void *userdata; // passed unchanged as first argument of userfunction
   44 
   45   char *buffer; // pending text kept between replace() calls; NULL when empty
   46   int begin_progress; // how many leading characters of begin have matched so far
   47   int begin_length; // number of characters in begin
   48   bool inside_gap; // true after begin was seen and until end is seen
   49   int status; // last replace_step() result: 1 inside a gap, -1 outside
   50 };
   51 
   52 struct _site_files // one entry in the singly linked list of files to download
   53 {
   54   char *url; // absolute URL built with nth_url 1; also used to detect duplicates
   55   char *url2; // alternative absolute URL built with nth_url 2; may be NULL
   56   char *filename; // local file name derived from url
   57   enum FILETYPE ft; // kind of resource
   58   struct _site_files *next; // next entry, NULL at the tail
   59   FILE *fp; // output file, opened on demand by _save_file/_save_file_css
   60   struct _replace_info *ri; // CSS scanner state; allocated for STYLE entries in _add_download_files, else NULL
   61   short nth_url; // which URL the current transfer uses: 1 = url, 2 = url2
   62 };
   63 
   64 struct _site_userdata // per-page state handed to the SAX callbacks and the downloader
   65 {
   66   //void (*site_function)(void*, const char*, ...);
   67   void (*site_function)(void*, const char*, va_list); // output callback: (userdata, printf-style format, arguments)
   68   void *userdata; // passed unchanged as first argument of site_function
   69 
   70   struct _site_files *sf; // head of the download list, NULL when empty
   71   char *_base_url; // base for resolving relative URLs; when NULL the effective URL of _hnd is queried
   72   char *_css_base_url; // set to a stylesheet's url in _add_download_files; base for URLs found in downloaded CSS
   73   bool _utf8_meta_set; // true once a Content-Type meta tag was rewritten to utf-8
   74   CURL *_mhnd; // NOTE(review): not referenced in the visible part of the file
   75   CURL *_hnd; // easy handle queried for CURLINFO_EFFECTIVE_URL
   76 };
   77 
   78 struct _css_filter_userdata // userdata for _css_filter (inline style attributes)
   79 {
   80   struct _site_userdata *su; // page state used for output and for queueing downloads
   81   char *url; // URL text collected between "url(" and ')'; NULL when none is pending
   82 };
   83 
   84 struct _css_filter_save_userdata // userdata for _save_file_css_save (downloaded stylesheets)
   85 {
   86   struct _site_userdata *su; // page state used for queueing downloads
   87   FILE *fp; // rewritten stylesheet output; NULL until the first callback opens it
   88   char *filename; // path opened for fp (points at the list entry's filename)
   89   char *url; // URL text collected between "url(" and ')'; set to NULL when fp is opened
   90 };
   91 
   92 static char *_filetype_string(enum FILETYPE ft)
   93 {
   94   char *txt;
   95   switch (ft)
   96     {
   97     case IMG:
   98       txt = "IMG";
   99       break;
  100     case CSS_IMG:
  101       txt = "CSS_IMG";
  102       break;
  103     case STYLE:
  104       txt = "STYLE";
  105       break;
  106     case SCRIPT:
  107       txt = "SCRIPT";
  108       break;
  109     case IFRAME:
  110       txt = "IFRAME";
  111       break;
  112     case FRAME:
  113       txt = "FRAME";
  114       break;
  115     case PDF:
  116       txt = "PDF";
  117       break;
  118     case NONE:
  119       txt =  "OTHER";
  120       break;
  121     default:
  122       txt = "DEFAULT";
  123       break;
  124     }
  125 
  126   return txt;
  127 }
  128 
  129 static void _user_function(struct _site_userdata *su, const char *fmt, ...)
  130 {
  131   va_list ap;
  132   va_start(ap, fmt);
  133   su->site_function(su->userdata, fmt, ap);
  134   va_end(ap);
  135 }
  136 
  /* Append the first len_b bytes of b to the heap string a and return the
     (possibly moved) result, always NUL-terminated.
   *
   * a      heap string to extend, or NULL to start a new string; ownership
   *        passes to this function
   * b      bytes to append (need not be NUL-terminated)
   * len_b  number of bytes to take from b; values <= 0 append nothing
   *
   * Returns the joined string, or NULL if memory could not be obtained. On
   * failure a is released: callers store the result back into the pointer
   * they passed in, so keeping the old block alive would leak it. */
  static char *__join_together(char *a, char *b, int len_b)
  {
    size_t len_a = (a != NULL) ? strlen(a) : 0;
    size_t add = (len_b > 0) ? (size_t)len_b : 0;
    char *joined = realloc(a, len_a + add + 1);

    if (joined == NULL)
      {
        free(a);
        return NULL;
      }

    if (add > 0)
      memcpy(joined + len_a, b, add);
    joined[len_a + add] = '\0';

    return joined;
  }
  158 
  159 // return true if inside gap -> 1
  160 // return false if outside gap -> -1
  161 static int inline replace_step(struct _replace_info *ri, char txt)
  162 {
  163   if (txt == ri->begin[ri->begin_progress])
  164     ri->begin_progress++;
  165   else
  166     ri->begin_progress = 0;
  167 
  168   if (ri->begin_progress == ri->begin_length)
  169     {
  170       ri->begin_progress = 0;
  171       ri->inside_gap = true;
  172       return -1;
  173     }
  174 
  175   if (ri->inside_gap)
  176     {
  177       if (txt == ri->end)
  178         {
  179           ri->inside_gap = false;
  180           return -1;
  181         }
  182       else
  183         return 1;
  184     }
  185 
  186   return -1;
  187 }
  188 
  189 static void replace(struct _replace_info *ri, char *txt, int length) // run txt through replace_step() and hand each run of equal status to ri->userfunction
  190 {
  191   int i;
  192   int offset = 0; // start of the part of txt that has not been delivered yet
  193   int status_temp = -1; // status of the character examined last
  194 
  195 
  196   for (i = 0; i < length; i++)
  197     {
  198       status_temp = replace_step(ri, txt[i]);
  199 
  200       if (ri->status != status_temp) // status flips at txt[i]: deliver everything before it
  201         {
  202           if (ri->buffer != NULL) // text kept from an earlier call goes out first, with the old status
  203             {
  204               if (ri->status == 1)
  205                 {
  206                   ri->userfunction(ri->userdata, ri->buffer, strlen(ri->
  207                                    buffer), true);
  208                 }
  209               else if (ri->status == -1)
  210                 {
  211                   ri->userfunction(ri->userdata, ri->buffer, strlen(ri->
  212                                    buffer), false);
  213                 }
  214               free(ri->buffer);
  215               ri->buffer = NULL;
  216             }
  217 
  218           if (ri->status == 1) // deliver txt[offset..i) flagged with the old status (true = inside a gap)
  219             {
  220               ri->userfunction(ri->userdata, txt+offset, i-offset, true);
  221             }
  222           else if (ri->status == -1)
  223             {
  224               ri->userfunction(ri->userdata, txt+offset, i-offset, false);
  225             }
  226 
  227           offset = i; // txt[i] starts the next run
  228         }
  229       ri->status = status_temp;
  230     }
  231 
  232   if (offset != length) // undelivered tail of this chunk
  233     {
  234       if (status_temp == 1 || status_temp == -1)
  235         {
  236           ri->userfunction(ri->userdata, txt+offset, i-offset, ri->status
  237                            == 1 ? true : false);
  238         }
  239       else // NOTE(review): replace_step() as visible here returns only 1 or -1, so this branch looks unreachable - confirm
  240         {
  241           if (txt[length-1] == '\0')
  242             {
  243               if (ri->buffer != NULL)
  244                 {
  245                   ri->userfunction(ri->userdata, ri->buffer, strlen(ri->
  246                                    buffer), false);
  247                 }
  248               free(ri->buffer);
  249               ri->buffer = NULL;
  250               ri->userfunction(ri->userdata, txt+offset, length-offset,
  251                                false);
  252             }
  253           else
  254             ri->buffer = __join_together(ri->buffer, txt+offset, length-
  255                          offset); // keep the tail for the next call
  256         }
  257     }
  258 
  259 }
  260 
  261 static void _set_chnd(CURL *hnd, char *url, void *cbfunction, void
  262                       *userdata)
  263 {
  264   curl_easy_setopt(hnd, CURLOPT_INFILESIZE_LARGE, (curl_off_t)-1);
  265   curl_easy_setopt(hnd, CURLOPT_URL, url);
  266   curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1);
  267   curl_easy_setopt(hnd, CURLOPT_FAILONERROR, 0);
  268   curl_easy_setopt(hnd, CURLOPT_USERAGENT, "libmessage - btwotch+
  269                    libmessage@XXX.com");
  270   //curl_easy_setopt(hnd, CURLOPT_USERAGENT, "Mozilla/5.0 (X11; U; Linux
  271                      x86_64; en-US) AppleWebKit/534.3 (KHTML, like Gecko)
  272                      Chrome/6.0.472.62 Safari/534.3");
  273   curl_easy_setopt(hnd, CURLOPT_RESUME_FROM_LARGE, (curl_off_t)0);
  274   curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50);
  275   curl_easy_setopt(hnd, CURLOPT_SSLVERSION, 0);
  276   curl_easy_setopt(hnd, CURLOPT_TIMECONDITION, 0);
  277   curl_easy_setopt(hnd, CURLOPT_TIMEVALUE, 0);
  278   curl_easy_setopt(hnd, CURLOPT_CUSTOMREQUEST, NULL);
  279   curl_easy_setopt(hnd, CURLOPT_CONNECTTIMEOUT, CURL_TIMEOUT_SEC);
  280   curl_easy_setopt(hnd, CURLOPT_TIMEOUT, CURL_TIMEOUT_SEC);
  281   curl_easy_setopt(hnd, CURLOPT_HTTPAUTH, 1);
  282   curl_easy_setopt(hnd, CURLOPT_ENCODING, NULL);
  283   curl_easy_setopt(hnd, CURLOPT_IPRESOLVE, 0);
  284   curl_easy_setopt(hnd, CURLOPT_IGNORE_CONTENT_LENGTH, 0);
  285   curl_easy_setopt(hnd, CURLOPT_POSTREDIR, 0);
  286   curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, cbfunction);
  287   curl_easy_setopt(hnd, CURLOPT_WRITEDATA, userdata);
  288   curl_easy_setopt(hnd, CURLOPT_FOLLOWLOCATION, 1);
  289   curl_easy_setopt(hnd, CURLOPT_NOSIGNAL, 1);
  290   curl_easy_setopt(hnd, CURLOPT_AUTOREFERER, 1);
  291   curl_easy_setopt(hnd, CURLOPT_ENCODING, "deflate");
  292 }
  293 
  294 static void _filename_gen(struct _site_files *first_sf, char *filename)
  295 {
  296   int i;
  297   bool name_double;
  298   struct _site_files *sf;
  299 
  300   do
  301     {
  302       name_double = false;
  303       srand(1337^filename[0]);
  304 
  305       for (i = FILELENGTH/2; i < FILELENGTH; i++)
  306         filename[i] = ALPHABET[rand()% (strlen(ALPHABET)-1)];
  307 
  308       filename[FILELENGTH-1] = '\0';
  309 
  310       sf = first_sf;
  311 
  312       while (sf != NULL && sf->filename != NULL)
  313         {
  314           if (!strcasecmp(sf->filename, filename))
  315             name_double = true;
  316 
  317           sf = sf->next;
  318         }
  319     }
  320   while (name_double);
  321 }
  322 
  323 
  /* Strip leading spaces and matching surrounding quotes from a URL.
   *
   * rurl is modified in place (closing quotes are overwritten with NUL) and
   * the returned pointer points into the same buffer, so the caller must
   * keep using the original pointer for freeing.
   *
   * A pair is removed only when the first and the last character are the
   * same quote character (' or "). Nested pairs such as '"x"' are removed
   * layer by layer. */
  static char* _shrink_url(char *rurl) // remove apostrophes etc.
  {
    size_t length;

    while (rurl[0] == ' ')
      rurl++;

    length = strlen(rurl);

    while (length >= 2 &&
           (rurl[0] == '\'' || rurl[0] == '\"') &&
           rurl[0] == rurl[length-1])
      {
        rurl[length-1] = '\0';
        rurl++;
        length -= 2;
      }

    return rurl;
  }
  346 
  347 static void _crap_sites_aburl(char **abs_url, CURL *hnd, char *rurl, char
  348                               *static_burl)
  349 {
  350   int abs_urllen;
  351 
  352   if (!strncasecmp(rurl, "//", 2)) // gmx-hack
  353     {
  354       abs_urllen = 5+strlen(rurl)+1;
  355       *abs_url = malloc(abs_urllen*sizeof(char));
  356       snprintf(*abs_url, abs_urllen, "http:%s", rurl);
  357     }
  358 
  359 }
  360 
  361 static void _relative_aburl(char **abs_url, CURL *hnd, char *rurl, char
  362                             *static_burl, short nth_url)
  363 {
  364   int abs_urllen;
  365   int domain_end = 0, i;
  366   int base_len = 0;
  367   char *burl = NULL;
  368 
  369   if (strncasecmp(rurl, "http://", 7) &&
  370       strncasecmp(rurl, "https://", 8) &&
  371       strncasecmp(rurl, "ftp://", 6) &&
  372       strncasecmp(rurl, "file://", 7) &&
  373       strncasecmp(rurl, "about:", 6) &&
  374       strncasecmp(rurl, "javascript:", 11))
  375     {
  376       if (static_burl != NULL)
  377         burl = static_burl;
  378       else if (curl_easy_getinfo(hnd, CURLINFO_EFFECTIVE_URL, &burl) !=
  379                CURLE_OK)
  380         {
  381           fprintf(stderr, "CURLINFO_EFFECTIVE_URL failed\n");
  382           exit(1);
  383         }
  384 
  385       if (!strncasecmp(burl, "http://", 7))
  386         domain_end = 7;
  387       else if (!strncasecmp(burl, "https://", 8))
  388         domain_end = 8;
  389       else if (!strncasecmp(burl, "ftp://", 6))
  390         domain_end = 6;
  391       else if (!strncasecmp(burl, "file://", 6))
  392         domain_end = strlen(burl);
  393 
  394       for (i = domain_end+1; i < strlen(burl); i++)
  395         {
  396           if (burl[i] == '/')
  397             {
  398               if (i < strlen(burl)-1)
  399                 if (burl[i+1] == '/')
  400                   continue;
  401 
  402               if (nth_url == 1)
  403                 {
  404                   base_len = i;
  405                   break;
  406                 }
  407               else
  408                 nth_url--;
  409             }
  410         }
  411       if (base_len == 0)
  412         base_len = strlen(burl);
  413 
  414       abs_urllen = strlen(rurl) + strlen(burl) + 2;
  415       *abs_url = malloc(sizeof(char)*abs_urllen);
  416       snprintf(*abs_url, abs_urllen, "%.*s/%s", base_len, burl, rurl);
  417     }
  418 
  419 }
  420 
  421 static char* _absolute_url(CURL *hnd, char *rurl, char *static_burl, short
  422                            nth_url)
  423 {
  424   char *abs_url = NULL;
  425 
  426   if (nth_url == 1)
  427     {
  428       _crap_sites_aburl(&abs_url, hnd, rurl, static_burl);
  429       if (abs_url != NULL)
  430         return abs_url;
  431     }
  432 
  433   _relative_aburl(&abs_url, hnd, rurl, static_burl, nth_url);
  434   if (abs_url != NULL)
  435     return abs_url;
  436 
  437   if (nth_url == 1)
  438     {
  439       int abs_urllen = strlen(rurl)+1;
  440       abs_url = malloc(abs_urllen+1);
  441       strncpy(abs_url, rurl, abs_urllen);
  442       //abs_url = strdup(rurl);
  443     }
  444 
  445   return abs_url;
  446 
  447 }
  448 
  449 static char *_site_files_add(struct _site_userdata *su, char *url, char
  450                              *base_url, enum FILETYPE ft)
  451 {
  452   struct _site_files *sf = su->sf;
  453   char *newurl, *newfilename, *sec_url;
  454   int i;
  455   int url_length;
  456   int filename_length;
  457 
  458   if (url == NULL)
  459     return NULL;
  460 
  461   url_length = strlen(url)+1;
  462 
  463   url = _shrink_url(url);
  464   newurl = _absolute_url(su->_hnd, url, (base_url == NULL) ? su->_base_url :
  465            base_url, 1);
  466   sec_url = _absolute_url(su->_hnd, url, (base_url == NULL) ? su->_base_url
  467             : base_url, 2);
  468 
  469   if (sf != NULL)
  470     {
  471       if (!strcmp(sf->url, newurl))
  472         {
  473           free(newurl);
  474           return sf->filename;
  475         }
  476       while (sf->next != NULL)
  477         {
  478           sf = sf->next;
  479           if (!strcmp(sf->url, newurl))
  480             {
  481               free(newurl);
  482               return sf->filename;
  483             }
  484         }
  485       sf->next = malloc(sizeof(struct _site_files));
  486       sf = sf->next;
  487       sf->ri = NULL;
  488       sf->next = NULL;
  489     }
  490   else
  491     {
  492       sf = malloc(sizeof(struct _site_files));
  493       sf->ri = NULL;
  494       su->sf = sf;
  495       sf->next = NULL;
  496     }
  497 
  498   sf->filename = NULL;
  499   sf->ft = ft;
  500   sf->url = newurl;
  501   sf->url2 = sec_url;
  502 
  503 
  504   filename_length = strlen(newurl)+1;
  505   if (filename_length > FILELENGTH)
  506     filename_length = FILELENGTH;
  507 
  508   newfilename = malloc(sizeof(char)*(filename_length));
  509   strncpy(newfilename, sf->url, filename_length);
  510   if (filename_length == FILELENGTH)
  511     _filename_gen(sf, newfilename);
  512 
  513   sf->filename = newfilename;
  514 
  515   for (i = 0; i < strlen(sf->filename); i++)
  516     {
  517       if (sf->filename[i] == '/')
  518         sf->filename[i] = '_';
  519       else if (sf->filename[i] == '?')
  520         sf->filename[i] = '_';
  521       else if (sf->filename[i] == '#')
  522         sf->filename[i] = '_';
  523       else if (sf->filename[i] == '@')
  524         sf->filename[i] = '_';
  525       else if (sf->filename[i] == '%')
  526         sf->filename[i] = '_';
  527       else if (sf->filename[i] == ':')
  528         sf->filename[i] = '_';
  529       else if (sf->filename[i] == ' ')
  530         sf->filename[i] = '_';
  531     }
  532 
  533   return sf->filename;
  534 }
  535 
  536 void _save_file_css_save(void *userdata, char *gap, int length, bool gapped)
  537 {
  538   struct _css_filter_save_userdata *cfsu = (struct
  539                                            _css_filter_save_userdata*)
  540                                            userdata;
  541   char *filename;
  542 
  543   if (cfsu->fp == NULL)
  544     { // first call of this func.
  545       cfsu->fp = fopen(cfsu->filename, "w");
  546       cfsu->url = NULL;
  547     }
  548 
  549   if (gapped)
  550     cfsu->url = __join_together(cfsu->url, gap, length);
  551   else if (!gapped && cfsu->url != NULL)
  552     {
  553       filename = _site_files_add(cfsu->su, cfsu->url, cfsu->su->
  554                  _css_base_url, CSS_IMG);
  555       fprintf(cfsu->fp, "%s", filename);
  556       free(cfsu->url);
  557       cfsu->url = NULL;
  558       fprintf(cfsu->fp, "%.*s", length, gap);
  559     }
  560   else
  561     fprintf(cfsu->fp, "%.*s", length, gap);
  562 
  563 }
  564 
  565 
  566 size_t _save_file_css(char *txt, size_t size, size_t nmemb, struct
  567                       _site_files *sf) // feed the replacer!
  568 {
  569   if (size == 0 && nmemb == 0 && sf->fp != NULL)
  570     {
  571       fclose(sf->fp);
  572     }
  573   else if (sf->fp == NULL)
  574     sf->fp=fopen(sf->filename, "w");
  575 
  576   if (sf->fp == NULL)
  577     {
  578       perror("fopen");
  579       return 0;
  580     }
  581 
  582   replace(sf->ri, txt, size*nmemb);
  583 
  584   return size*nmemb;
  585 }
  586 
  587 size_t _save_file(char *txt, size_t size, size_t nmemb, struct _site_files
  588                   *sf)
  589 {
  590   int i;
  591 
  592   if (size == 0 && nmemb == 0 && sf->fp != NULL)
  593     {
  594       fclose(sf->fp);
  595     }
  596   else if (sf->fp == NULL)
  597     sf->fp=fopen(sf->filename, "w");
  598 
  599   if (sf->fp == NULL)
  600     {
  601       perror("fopen");
  602       return 0;
  603     }
  604 
  605   for (i = 0; i < size*nmemb; i++)
  606     fputc(txt[i], sf->fp);
  607 
  608   return size*nmemb;
  609 }
  610 
  611 static void _set_css_ri(struct _replace_info *ri, void *userdata, void
  612                         *userfunction)
  613 {
  614   ri->begin = "url(";
  615   ri->end = ')';
  616 
  617   ri->userfunction = userfunction;
  618   ri->userdata = userdata;
  619 
  620   ri->buffer = NULL;
  621 
  622   ri->begin_progress = 0;
  623   ri->begin_length = 4;
  624   ri->inside_gap = false;
  625 
  626   ri->status = -1;
  627 }
  628 
  629 static void _add_download_files(struct _site_files *sf, struct
  630                                 _site_userdata *su, CURL *mhnd, short
  631                                 nth_url)
  632 {
  633   struct _css_filter_save_userdata *cfsu;
  634   CURL *hnd;
  635 
  636   hnd = curl_easy_init();
  637   sf->fp = NULL;
  638   sf->nth_url = nth_url;
  639 
  640 
  641 #ifdef DEBUG
  642   fprintf(stderr, "Download (%s) %s -> %s\n", _filetype_string(sf->ft), sf->
  643           url, sf->filename);
  644 #endif
  645 
  646   if (sf->ft == STYLE)
  647     {
  648       sf->ri = malloc(sizeof(struct _replace_info));
  649       su->_css_base_url = sf->url;
  650       cfsu = malloc(sizeof(struct _css_filter_save_userdata));
  651       cfsu->fp = NULL;
  652       cfsu->filename = sf->filename;
  653       cfsu->su = su;
  654       _set_css_ri(sf->ri, cfsu, _save_file_css_save);
  655       if (nth_url == 1)
  656         _set_chnd(hnd, sf->url, _save_file_css, sf);
  657       else if (nth_url == 2)
  658         _set_chnd(hnd, sf->url2, _save_file_css, sf);
  659     }
  660   else if (nth_url == 1)
  661     _set_chnd(hnd, sf->url, _save_file, sf);
  662   else if (nth_url == 2)
  663     _set_chnd(hnd, sf->url2, _save_file, sf);
  664 
  665   curl_easy_setopt(hnd, CURLOPT_PRIVATE, sf);
  666   curl_multi_add_handle(mhnd, hnd);
  667 }
  668 
  669 
  670 static void _download_files(struct _site_userdata *su)
  671 {
  672   int handles = 1, msgs_in_queue, maxfd;
  673   int iteration = 0;
  674   int downloads = 0; // current downloads
  675   char *curlinfo_private;
  676   CURL *mhnd;
  677   CURLMsg *cmsg;
  678   struct _site_files *first_sf = su->sf;
  679   struct _site_files *sf = first_sf;
  680   struct _site_files *tmp_sf;
  681   struct timeval timeout;
  682   fd_set fdread, fdwrite, fderr;
  683   char *burl;
  684 #ifdef DEBUG
  685   char *ip;
  686 #endif
  687   long response_code;
  688 
  689   if (sf == NULL)
  690     return;
  691   mhnd = curl_multi_init();
  692 
  693   _add_download_files(sf, su, mhnd, 1);
  694   downloads++;
  695   sf = sf->next;
  696 
  697   while (CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mhnd, &handles) &&
  698          handles != 0);
  699 
  700   do
  701     {
  702       iteration++;
  703       FD_ZERO(&fdread);
  704       FD_ZERO(&fdwrite);
  705       FD_ZERO(&fderr);
  706       timeout.tv_sec = SELECT_TIMEOUT_SEC;
  707       timeout.tv_usec = 0;
  708       curl_multi_fdset(mhnd, &fdread, &fdwrite, &fderr, &maxfd);
  709       switch (select(maxfd+1, &fdread, &fdwrite, &fderr, &timeout))
  710         {
  711         case -1:
  712 #ifdef DEBUG
  713           fprintf(stderr, "select bad :(\n");
  714           perror("!!! select failed ");
  715           while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) !=
  716                  NULL)
  717             {
  718               if (cmsg->data.result != 0)
  719                 {
  720                   curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIMARY_IP,
  721                                     &ip);
  722                   fprintf(stderr, "ip: %s url: %s result: %d", ip, burl,
  723                           cmsg->data.result);
  724                   if (cmsg->data.result == 7)
  725                     fprintf(stderr, " (couldn't connect)");
  726                   fprintf(stderr, "\n");
  727                 }
  728             }
  729           fprintf(stderr, "-----------\n");
  730 #endif
  731         default:
  732           while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) !=
  733                  NULL)
  734             if (cmsg->msg == CURLMSG_DONE)
  735               {
  736                 curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIVATE,
  737                                   &curlinfo_private);
  738                 tmp_sf = (struct _site_files*)curlinfo_private;
  739                 if (tmp_sf->ft == CSS_IMG)
  740                   _save_file_css(NULL, 0, 0, tmp_sf);
  741                 else
  742                   _save_file(NULL, 0, 0, tmp_sf);
  743 
  744                 downloads--;
  745 
  746                 curl_easy_getinfo(cmsg->easy_handle, CURLINFO_EFFECTIVE_URL,
  747                                   &burl);
  748                 curl_easy_getinfo(cmsg->easy_handle, CURLINFO_RESPONSE_CODE,
  749                                   &response_code);
  750                 if (response_code > 400)
  751                   {
  752                     if (tmp_sf->nth_url == 2)
  753                       {
  754                         fprintf(stderr, "Failed (%ld): %s -> %s (%s)   ",
  755                                 response_code, burl, tmp_sf->filename,
  756                                 _filetype_string(tmp_sf->ft));
  757                         fprintf(stderr, "second url: %s\n", tmp_sf->url2);
  758                       }
  759                     else
  760                       _add_download_files(tmp_sf, su, mhnd, 2);
  761                   }
  762 
  763                 curl_easy_cleanup(cmsg->easy_handle);
  764               }
  765           do
  766             {
  767               // download 1st file
  768               if (iteration == 1 && sf != NULL && downloads <
  769                   MAX_P_FILE_DOWNLOADS)
  770                 {
  771                   _add_download_files(sf, su, mhnd, 1);
  772                   downloads++;
  773                 }
  774 
  775 
  776               while (sf != NULL && sf->next != NULL && downloads <
  777                      MAX_P_FILE_DOWNLOADS) // sf->next must not be NULL as
  778                      we are adding to the list ;)
  779                 {
  780                   _add_download_files(sf->next, su, mhnd, 1);
  781                   downloads++;
  782                   sf = sf->next;
  783                 }
  784             }
  785           while (CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mhnd,
  786                  &handles) && handles != 0);
  787 
  788           break;
  789 
  790         }
  791     }
  792   while (handles != 0);
  793 
  794   while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL)
  795     {
  796       curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIVATE,
  797                         &curlinfo_private);
  798       tmp_sf = (struct _site_files*)curlinfo_private;
  799       if (tmp_sf->ft == CSS_IMG)
  800         _save_file_css(NULL, 0, 0, tmp_sf);
  801       else
  802         _save_file(NULL, 0, 0, tmp_sf);
  803       downloads--;
  804 
  805       curl_easy_getinfo(cmsg->easy_handle, CURLINFO_EFFECTIVE_URL, &burl);
  806       curl_easy_getinfo(cmsg->easy_handle, CURLINFO_RESPONSE_CODE,
  807                         &response_code);
  808       if (response_code > 400)
  809         {
  810           if (tmp_sf->nth_url == 2)
  811             {
  812               fprintf(stderr, "Failed (%ld): %s -> %s (%s)   ",
  813                       response_code, burl, tmp_sf->filename,
  814                       _filetype_string(tmp_sf->ft));
  815               fprintf(stderr, "second url: %s\n", tmp_sf->url2);
  816             }
  817           else
  818             _add_download_files(tmp_sf, su, mhnd, 2);
  819         }
  820 
  821       curl_easy_cleanup(cmsg->easy_handle);
  822     }
  823 
  824   sf = first_sf;
  825   while (sf != NULL)
  826     {
  827       free(sf->url);
  828       free(sf->url2);
  829       free(sf->filename);
  830       if (sf->ri != NULL)
  831         {
  832           free(sf->ri->userdata);
  833           free(sf->ri);
  834         }
  835       tmp_sf = sf;
  836       sf = sf->next;
  837       free(tmp_sf);
  838     }
  839 
  840   curl_multi_cleanup(mhnd);
  841 
  842 }
  843 
  844 void _css_filter(void *userdata, char *gap, int length, bool gapped)
  845 {
  846   struct _css_filter_userdata *cfu = (struct _css_filter_userdata*)
  847                                      userdata;
  848   char *filename;
  849 
  850   if (gapped)
  851     cfu->url = __join_together(cfu->url, gap, length);
  852   else if (!gapped && cfu->url != NULL)
  853     {
  854       filename = _site_files_add(cfu->su, cfu->url, NULL, CSS_IMG);
  855       _user_function(cfu->su, "%s", filename);
  856       free(cfu->url);
  857       cfu->url = NULL;
  858       _user_function(cfu->su, "%.*s", length, gap);
  859     }
  860   else
  861     _user_function(cfu->su, "%.*s", length, gap);
  862 }
  863 
  864 
  865 static void _getpage_startElementSAX (void * userData, const xmlChar * name,
  866                                       const xmlChar ** atts)
  867 {
  868   int i, j;
  869   char *n = (char*)name;
  870   char *filename, *url;
  871   struct _site_userdata *su = userData;
  872   struct _css_filter_userdata cfu;
  873   struct _replace_info ri;
  874 
  875 
  876   _user_function(su, "<%s", n);
  877 
  878   if (atts != NULL)
  879     for (i = 0; atts[i] != NULL; i+=2)
  880       {
  881         filename = NULL;
  882 
  883         if (!strncasecmp(n, "img", 4) && !strncasecmp((char*)atts[i], "src",
  884             4))
  885           {
  886             filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG);
  887             _user_function(su, " src=\"file:%s\"", filename);
  888           }
  889         else if (!strncasecmp(n, "input", 6) && !strncasecmp((char*)atts[i],
  890                  "src", 4))
  891           {
  892             filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG);
  893             _user_function(su, " src=\"file:%s\"", filename);
  894           }
  895         else if (!strncasecmp(n, "script", 7) && !strncasecmp((char*)atts[i]
  896                  , "src", 4))
  897           {
  898             filename = _site_files_add(su, (char*)atts[i+1], NULL, SCRIPT);
  899             _user_function(su, " src=\"file:%s\"", filename);
  900           }
  901         else if (!strncasecmp(n, "iframe", 7) && !strncasecmp((char*)atts[i]
  902                  , "src", 4))
  903           {
  904             filename = _site_files_add(su, (char*)atts[i+1], NULL, IFRAME);
  905             _user_function(su, " src=\"file:%s\"", filename);
  906           }
  907         else if (!strncasecmp((char*)atts[i], "style", 6))
  908           {
  909             cfu.su = su;
  910             cfu.url = NULL;
  911             _set_css_ri(&ri, &cfu, _css_filter);
  912             _user_function(su, " style=\"");
  913             replace(&ri, (char*)atts[i+1], strlen((char*)atts[i+1]));
  914             if (cfu.url != NULL)
  915               free(cfu.url);
  916             _user_function(su, "\"");
  917             filename = (void*)-1;
  918           }
  919         else if (!strncasecmp(n, "link", 5) && !strncasecmp((char*)atts[i],
  920                  "href", 5))
  921           {
  922             for (j = 0; atts[j] != NULL; j+=2)
  923               if (!strncasecmp((char*)atts[j], "rel", 4))
  924                 {
  925                   if (!strncasecmp((char*)atts[j+1], "stylesheet", 11))
  926                     {
  927                       filename = _site_files_add(su, (char*)atts[i+1], NULL,
  928                                  STYLE);
  929                       _user_function(su, " href=\"file:%s\"", filename);
  930                     }
  931                   else if (!strncasecmp((char*)atts[j+1], "icon", 5))
  932                     {
  933                       filename = _site_files_add(su, (char*)atts[i+1], NULL,
  934                                  IMG);
  935                       _user_function(su, " href=\"file:%s\"", filename);
  936                     }
  937                   else if (!strncasecmp((char*)atts[j+1], "shortcut icon",
  938                            14))
  939                     {
  940                       filename = _site_files_add(su, (char*)atts[i+1], NULL,
  941                                  IMG);
  942                       _user_function(su, " href=\"file:%s\"", filename);
  943                     }
  944                 }
  945           }
  946         else if (!strncasecmp(n, "a", 2) && !strncasecmp((char*)atts[i],
  947                  "href", 5))
  948           {
  949             url = _absolute_url(su->_hnd, (char*)atts[i+1], su->_base_url,
  950                   1);
  951             _user_function(su, " href=\"%s\"", url);
  952             free(url);
  953             filename = (void*)-1;
  954           }
  955         else if (!strncasecmp(n, "base", 5) && !strncasecmp((char*)atts[i],
  956                  "href", 5))
  957           {
  958             _user_function(su, " href=\".\"");
  959             filename = (void*)-1;
  960           }
  961         else if (!strncasecmp(n, "form", 5) && !strncasecmp((char*)atts[i],
  962                  "action", 7))
  963           {
  964             url = _absolute_url(su->_hnd, (char*)atts[i+1], su->_base_url,
  965                   1);
  966             _user_function(su, " action=\"%s\"", url);
  967             free(url);
  968             filename = (void*)-1;
  969           }
  970         else if (!strncasecmp(n, "meta", 5) && !strncasecmp((char*)atts[i],
  971                  "http-equiv", 8) && !strncasecmp((char*)atts[i+1],
  972                  "Content-Type", 13))
  973           {
  974             su->_utf8_meta_set = true;
  975             _user_function(su, " http-equiv=\"Content-Type\"
  976                            content=\"text/html; charset=utf-8\"");
  977             //filename = (void*)-1;
  978             break;
  979           }
  980 
  981 
  982         if (filename == NULL)
  983           _user_function(su, " %s=\"%s\"", (char*)atts[i], (char*)atts[i+1])
  984                          ;
  985 
  986       }
  987 
  988   _user_function(su, ">");
  989 
  990 }
  991 
  992 static void _getpage_endElementSAX (void * userData, const xmlChar * name)
  993 {
  994   char *n = (char*)name;
  995   struct _site_userdata *su = userData;
  996 
  997   if (!strncasecmp("head", n, 5) && !su->_utf8_meta_set)
  998     _user_function(su, "<meta  http-equiv=\"Content-Type\"
  999                    content=\"text/html; charset=utf-8\"/> </head>");
 1000   else if (strncasecmp("br", n, 3) && strncasecmp("img", n, 4) &&
 1001            strncasecmp("meta", n, 5) && strncasecmp("link", n, 5) &&
 1002            strncasecmp("input", n, 5))
 1003     _user_function(su, "</%s>\n", n);
 1004 }
 1005 
 1006 static void _getpage_charDataSAX (void * userData, const xmlChar * buffer,
 1007                                   int length)
 1008 {
 1009   struct _site_userdata *su = userData;
 1010   _user_function(su, "%.*s", length, buffer);
 1011 }
 1012 
 1013 static size_t _chunk_parse(void *ptr, size_t size, size_t nmemb,
 1014                            xmlParserCtxtPtr ctxt)
 1015 {
 1016   char *txt = ptr;
 1017 #ifdef DEBUG
 1018   FILE *fp = fopen("bare.txt", "a+");
 1019 
 1020   fprintf(fp, "%.*s", (int)(size*nmemb), txt);
 1021   fclose(fp);
 1022 #endif
 1023   htmlParseChunk(ctxt, txt, size*nmemb, 0);
 1024 
 1025   return nmemb*size;
 1026 }
 1027 
 1028 void getpage(char *url, void *site_function, void *userdata)
 1029 {
 1030   struct _site_userdata su;
 1031   su.site_function = site_function;
 1032   su.userdata = userdata;
 1033   su.sf = NULL;
 1034   su._utf8_meta_set = false;
 1035   su._base_url = NULL;
 1036   su._css_base_url = NULL;
 1037   CURLcode ret;
 1038 
 1039   htmlSAXHandler hsh;
 1040   htmlParserCtxtPtr ctxt;
 1041 
 1042 #ifdef DEBUG
 1043   remove("bare.txt");
 1044 #endif
 1045 
 1046   memset(&hsh, 0, sizeof(htmlSAXHandler));
 1047 
 1048   hsh.startElement = _getpage_startElementSAX;
 1049   hsh.endElement = _getpage_endElementSAX;
 1050   hsh.characters = _getpage_charDataSAX;
 1051 
 1052 
 1053   ctxt = htmlCreatePushParserCtxt(&hsh, &su, NULL, 0, NULL,
 1054          XML_CHAR_ENCODING_UTF8);
 1055   htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER);
 1056 
 1057   curl_global_init(CURL_GLOBAL_ALL);
 1058   su._hnd = curl_easy_init();
 1059   _set_chnd(su._hnd, url, _chunk_parse, ctxt);
 1060   ret = curl_easy_perform(su._hnd);
 1061 
 1062   htmlParseChunk(ctxt, NULL, 0, 1);
 1063   htmlFreeParserCtxt(ctxt);
 1064 
 1065   curl_easy_getinfo(su._hnd, CURLINFO_EFFECTIVE_URL, &su._base_url);
 1066 
 1067 #ifdef DEBUG
 1068   double val;
 1069   if (curl_easy_getinfo(su._hnd, CURLINFO_SPEED_DOWNLOAD, &val) == CURLE_OK)
 1070     printf("Average download speed: %0.3f kbyte/sec.\n", val / 1024);
 1071 #endif
 1072 
 1073   fprintf(stderr, "Downloading files ...\n");
 1074   _download_files(&su);
 1075   curl_easy_cleanup(su._hnd);
 1076   // curl_global_cleanup();
 1077 }

And now the code for the demo program:

    1 #include <stdio.h>
    2 #include <stdarg.h>
    3 
    4 #include "getpage.h"
    5 
    6 
    7 void site_function(void *userdata, const char* format, va_list ap)
    8 {
    9   FILE *fp = userdata;
   10 
   11   vfprintf(fp, format, ap);
   12   fflush(fp);
   13 }
   14 
   15 
   16 int main(int argc, char **argv)
   17 {
   18 
   19   FILE *fp = fopen(argv[2], "w");
   20   if (fp == NULL)
   21     return -1;
   22 
   23   getpage(argv[1], site_function, fp);
   24 
   25   fclose(fp);
   26 }


The code above shows roughly what the API will look like.
For testing, I compiled it to getpagetest and tried it on some websites with this script:

#!/bin/sh
# Fetch every test page with getpagetest under valgrind (all runs in parallel),
# wait until all runs have finished, then open each saved page in chromium.

pages="www.google.de www.heise.de www.sueddeutsche.de www.n-tv.de www.golem.de www.faz.net www.cnn.com nytimes.com focus.de www.n24.de apple.com derstandard.at bild.de www.spin.de bbc.co.uk"
for i in $pages
do
        #( ../getpagetest $i "$i.html"  && chromium "$i.html" &)
        valgrind --leak-check=full --log-file="$i.valgrind" ../getpagetest "$i" "$i.html" &
done
# printf instead of `echo -e`: plain /bin/sh does not guarantee -e support.
printf '\n\nWaiting ...\n\n\n'
# A bare `wait` blocks until every background job started above has exited;
# looking the processes up by name is unnecessary and can miss them.
wait
for i in $pages
do
        echo "Opening $i"
        chromium "$i.html"
done
12. Juni 2011 01:06

Tags:  ·  ·  ·  ·  ·  ·  ·  ·

Receive ARP Packets via libpcap

Sonstiges ·

This is how it works:

    1 #include <stdio.h>
    2 #include <stdlib.h>
    3 #include <pcap.h>
    4 #include <netinet/if_ether.h>
    5 
    6 #define BUFSIZE 4096
    7 #define ETH_HEADER_SIZE 14
    8 
    9 #define ERROR_EXIT(x,y) do { fprintf(stderr, "%s: %s\n" x, y); exit(1);}
   10                    while (0);
   11 
   12 void process_packet(u_char *user, const struct pcap_pkthdr *hdr, const
   13                     u_char *pkt)
   14 {
   15   struct ether_header *eth_header;
   16   struct ether_arp *arp_packet; /* from if_eth.h */
   17 
   18   eth_header = (struct ether_header *) pkt;
   19   arp_packet = (struct ether_arp *) (pkt + ETH_HEADER_SIZE);
   20 
   21   if (ntohs (eth_header->ether_type) == ETHERTYPE_ARP)  /* if it is an ARP
   22       packet */
   23     {
   24       printf ("Source: %d.%d.%d.%d\t\tDestination: %d.%d.%d.%d\n",
   25               arp_packet->arp_spa[0],
   26               arp_packet->arp_spa[1],
   27               arp_packet->arp_spa[2],
   28               arp_packet->arp_spa[3],
   29               arp_packet->arp_tpa[0],
   30               arp_packet->arp_tpa[1],
   31               arp_packet->arp_tpa[2],
   32               arp_packet->arp_tpa[3]);
   33     }
   34 
   35 }
   36 
   37 int main()
   38 {
   39   char buf[BUFSIZE];
   40 
   41   bpf_u_int32 netp;
   42   bpf_u_int32 maskp;
   43 
   44 
   45   char *filter = "arp";
   46   struct bpf_program bpf;
   47   char errbuf[PCAP_ERRBUF_SIZE];
   48 
   49   pcap_if_t *alldev;
   50   pcap_t *handle;
   51 
   52   if (pcap_findalldevs(&alldev, errbuf) < 0)
   53     ERROR_EXIT("pcap_findalldevs", errbuf);
   54 
   55   while (alldev != NULL)
   56     {
   57       printf("dev: %s\n", alldev->name);
   58       alldev = alldev->next;
   59     }
   60 
   61   handle = pcap_open_live("wlan0", BUFSIZE, 0, 0, errbuf);
   62   if (handle == NULL)
   63     ERROR_EXIT("pcap_open_live", errbuf);
   64 
   65   if (pcap_lookupnet ("wlan0", &netp, &maskp, errbuf) == -1)
   66     ERROR_EXIT("pcap_lookupnet", errbuf);
   67 
   68   if (pcap_compile(handle, &bpf, filter, 0, maskp) == -1)
   69     ERROR_EXIT("pcap_compile", pcap_geterr(handle));
   70 
   71   if (pcap_setfilter(handle, &bpf) == -1)
   72     ERROR_EXIT("pcap_setfilter", pcap_geterr(handle));
   73 
   74   pcap_freecode(&bpf);
   75 
   76   if (pcap_loop(handle, 10, process_packet, NULL) < 0)
   77     ERROR_EXIT("pcap_setfilter", pcap_geterr(handle));
   78 
   79 
   80 }

based on: http://commons.oreilly.com/wiki/index.php/Network_Security_Tools/Modifying_and_Hacking_Security_Tools/Writing_Network_Sniffers

14. März 2011 02:48

Tags:  ·  ·  ·

Send ARP Request via libnet

Sonstiges · · 1 Kommentar

This is what it looks like:

    1 #include <stdio.h>
    2 #include <libnet.h>
    3 
    4 int main()
    5 {
    6 // libnet
    7   libnet_t *l;
    8   char errbuf[LIBNET_ERRBUF_SIZE];
    9   int ret;
   10 
   11 //arp header
   12   u_int8_t *macaddr;
   13   struct libnet_ether_addr *hwaddr;
   14   libnet_ptag_t arp = 0;
   15 //ip header
   16   in_addr_t srcip, destip;
   17 //eth header
   18   libnet_ptag_t eth = 0;
   19 
   20 
   21   l = libnet_init(LIBNET_LINK, "wlan0", errbuf);
   22 
   23   if (l == NULL)
   24     {
   25       fprintf(stderr, "err: %s\n", errbuf);
   26       exit(1);
   27     }
   28 
   29   hwaddr = libnet_get_hwaddr(l);
   30 
   31   destip = inet_addr("192.168.178.1");
   32   srcip = libnet_get_ipaddr4(l);
   33   macaddr = libnet_hex_aton("ff:ff:ff:ff:ff:ff", &ret);
   34 
   35   arp = libnet_autobuild_arp(ARPOP_REQUEST, (uint8_t*)hwaddr, (uint8_t*)
   36         &srcip, macaddr, (uint8_t*)&destip, l);
   37   eth = libnet_autobuild_ethernet(macaddr, ETHERTYPE_ARP, l);
   38 
   39 
   40   if (libnet_write(l) == -1)
   41     {
   42       fprintf(stderr, "err: %s\n", errbuf);
   43       exit(1);
   44     }
   45 
   46 
   47   return 0;
   48 }

based on: http://commons.oreilly.com/wiki/index.php/Network_Security_Tools/Modifying_and_Hacking_Security_Tools/Writing_Packet-Injection_Tools
Unfortunately, there is no way to receive packets with it :(

14. März 2011 02:43

Tags:  ·  ·  ·  ·  ·

Current favorite internet radio streams

Sonstiges ·

P.S.: btw. egofm wants you to send them an E-Mail to get the stream url (http://www.egofm.de/default.aspx?ID=6301)

12. März 2011 18:43

Tags:  ·  ·  ·  ·  ·  ·  ·  ·  ·

ipv6 @home

Computer ·

I just tested traceroute6 on my parents' internet connection (1und1.de):

 traceroute6 heise.de
traceroute to heise.de (2a02:2e0:3fe:100::8) from 2002:5dc4:8c4a:0:21b:63ff:fe06:99a5, 30 hops max, 24 byte packets
 1  fritz.box (2002:5dc4:8c4a:0:be05:43ff:fe1e:a622)  7.611 ms  2.317 ms  2.425 ms
 2  * * *
 3  te3-1.c302.f.de.plusline.net (2001:7f8::3012:0:2)  18.389 ms  18.804 ms  22.642 ms
 4  te2-4.c102.f.de.plusline.net (2a02:2e0:1::5)  18.808 ms  22.216 ms  22.431 ms
 5  te6-2.c13.f.de.plusline.net (2a02:2e0:1::22)  18.65 ms  17.314 ms  18.374 ms
 6  redirector.heise.de (2a02:2e0:3fe:100::8)  19.172 ms  19.682 ms  21.527 ms

UPDATE: some friends told me that this is probably not native but only a tunnel :(

09. März 2011 23:37

Tags:

Watching TV online - or why you should not trust "X-Forwarded-For"

Sonstiges · · 3 Kommentare

Yesterday, someone tweeted about http://wilmaa.ch - unfortunately, this website is not available outside Switzerland :(
BUT: there is a PortableApp called "Wilmaa-Portable"


wilmaa windows client


With it and VLC you can watch TV streams even outside Switzerland if you click "Yes" in the "Are you in Suisse?" dialog box. This works great, but it is Windows-only :(
So I used wireshark:


wilmaa wireshark


What do I see there? X-Forwarded-For

The X-Forwarded-For (XFF) HTTP header field is a de facto standard for identifying the originating IP address of a client connecting to a web server through an HTTP proxy or load balancer.

http://en.wikipedia.org/wiki/X-Forwarded-For

So, to watch it in Linux, just use:

curl --header "X-Forwarded-For: x.x.x.x" "wilmaa-url" | mplayer -

where x.x.x.x is an IP address that geolocates to Switzerland (e.g. look one up with dig suisse.ch) and wilmaa-url is one of the URLs the Wilmaa PortableApp provides :)

Hint: If you don't want to install Windows, you can just use sqlite3 to get the URLs out of Wilmaa's database.
By the way, the hardest part of getting this working was installing Windows:

  • Windows 7 couldn't be installed because, even though it boots from CD, the installer still needs a CD driver
  • Windows XP couldn't start wilmaa PortableApp
  • Windows 2008 finally worked, after I figured out the password policies
06. März 2011 17:43

Tags:  ·  ·

Rearmed

Sonstiges ·

15. Januar 2011 22:20

Tags:

Seite 4 von 29
 ·  1 2 3 4 5 6 7..29 · 
Blog durchsuchen
(nur öffentliche Einträge)

Willst du auch bloggen?
Kostenlos bloggen bei Spin.de
Diese Seite ist eine auf spin.de gelagerte persönliche Homepage, deren Verantwortlichkeit beim Nutzer liegt.
spin.de ist eine große Online-Community mit Chat, Blogs, Foren, Online-Spielen und vielem mehr.
Deine eigene Homepage mit Blog und Gästebuch

Impressum · Datenschutz · Sitemap