innercoder.com

A blog for Linux programming enthusiasts

Capturing Websites With Sockets and HTTP

| Comments

This post shows a program that captures a web page's index.html using sockets. To do this, it creates a TCP client socket, connects to a web server, and writes all the data it receives to stdout. The input can be an IPv4 address, an IPv6 address, or a hostname that is resolved first.

As simple as it may sound, there are always little details to take into account, which lead you to investigate the special options and tweaks you can apply to your function calls.

Complete code at its github repository.

main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/*
 * Create a TCP socket for the given address family and set a receive
 * timeout on it. Each failure is reported through err_msg() and then the
 * program exits, so the returned descriptor is always valid.
 */
static int open_http_socket(int family, const struct timeval *timeout)
{
  int sock = socket(family, SOCK_STREAM, 0);

  if (sock == -1) {
      err_msg("Error creating socket");
      exit(EXIT_FAILURE);
  }
  if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
          timeout, sizeof(*timeout)) == -1) {
      err_msg("setsockopt()");
      exit(EXIT_FAILURE);
  }
  return sock;
}

/*
 * Program entry point.
 *
 * argv[1] is an IPv4 literal, an IPv6 literal, or a host name. Literals are
 * used directly; a host name is resolved with getaddrinfo(), preferring the
 * first IPv4 result and falling back to the first IPv6 result. A socket of
 * the matching family is then opened and handed to get_http_ipv4() or
 * get_http_ipv6() together with the request text.
 */
int main(int argc, char *argv[])
{
  int have_v4;
  int have_v6;
  int gai_rc;
  int sock;
  u_int trx_fd;                 /* descriptor in the type the get_http_* helpers take */
  char buffer[MAX];
  struct addrinfo hints = {0};  /* DNS resolution processing */
  struct addrinfo *results = NULL;  /* head of the DNS result list */
  struct addrinfo *rp;
  /* zeroed so fields never assigned below (flow info, scope id) are not garbage */
  struct sockaddr_in svr_addr4 = {0};
  struct sockaddr_in6 svr_addr6 = {0};
  struct timeval tt_wait;       /* used for recv() timeout */

  if (argc != 2) {
      usage(argv[0]);
      exit(0);
  }

  tt_wait.tv_sec = 3;           /* 3.5 secs timeout in total */
  tt_wait.tv_usec = 500000;

  /* entering http server presets in structure */
  svr_addr4.sin_family = AF_INET;
  svr_addr4.sin_port = htons((in_port_t) 80);
  svr_addr6.sin6_family = AF_INET6;
  svr_addr6.sin6_port = htons((in_port_t) 80);
  snprintf(buffer, sizeof(buffer), "GET / HTTP/1.1\r\n\r\n");

  /*
   * inet_pton() returns 1 only on a successful parse; 0 and -1 both mean
   * "not usable", so the results are kept as signed booleans.
   */
  have_v4 = (inet_pton(AF_INET, argv[1], &svr_addr4.sin_addr) == 1);
  have_v6 = (inet_pton(AF_INET6, argv[1], &svr_addr6.sin6_addr) == 1);

  if (!have_v4 && !have_v6) {
      /* not a literal address: treat the argument as a host name */
      hints.ai_family = AF_UNSPEC;
      hints.ai_socktype = SOCK_STREAM;
      /* ai_flags is a bitmask of AI_* options, not an address family; leave it 0 */

      gai_rc = getaddrinfo(argv[1], NULL, &hints, &results);
      if (gai_rc != 0) {
          /* report the resolver's own error code */
          err_msg(gai_strerror(gai_rc));
          exit(EXIT_FAILURE);
      }

      /* iterate with a cursor so the list head stays available for freeing */
      for (rp = results; rp != NULL; rp = rp->ai_next) {
          if (rp->ai_family == AF_INET) {
              const struct sockaddr_in *p4 =
                  (const struct sockaddr_in *) rp->ai_addr;

              svr_addr4.sin_addr = p4->sin_addr;
              have_v4 = 1;
              break;
          }
          if (rp->ai_family == AF_INET6 && !have_v6) {
              const struct sockaddr_in6 *p6 =
                  (const struct sockaddr_in6 *) rp->ai_addr;

              /* remember the first IPv6 result in case no IPv4 one shows up */
              svr_addr6.sin6_addr = p6->sin6_addr;
              svr_addr6.sin6_scope_id = p6->sin6_scope_id;
              have_v6 = 1;
          }
      }
      /* freeing the whole allocated result list */
      freeaddrinfo(results);

      if (!have_v4 && !have_v6) {
          fprintf(stderr, "%s: no IPv4 or IPv6 address found\n", argv[1]);
          exit(EXIT_FAILURE);
      }
  }

  if (have_v4) {
      sock = open_http_socket(AF_INET, &tt_wait);
      trx_fd = (u_int) sock;
      get_http_ipv4(&trx_fd, &svr_addr4, buffer);
  } else {
      sock = open_http_socket(AF_INET6, &tt_wait);
      trx_fd = (u_int) sock;
      get_http_ipv6(&trx_fd, &svr_addr6, buffer);
  }

  close(sock);
  exit(EXIT_SUCCESS);
}

This is the program's main function. It takes care of initializing the socket structures and setting up hostname resolution, and it is the central point of the program's execution. There are two execution paths for socket creation; which one is taken is determined by whether the user's input is an IPv4 or an IPv6 address.

1
2
3
4
5
6
7
8
9
10
11
/* Excerpt from main(): resolve the host name in argv[1]; on success rs
 * points to the list of results. */
if ((getaddrinfo(argv[1], 0, &hints, &rs)) != 0)
              /* NOTE(review): status is main()'s loop flag, not the value
               * returned by getaddrinfo() -- confirm the intended argument. */
              err_msg(gai_strerror(status));
          /* mark the input as IPv4 so the IPv4 branch runs on the next loop pass */
          resv4 = 1;
          /* walk the result list (advancing rs itself) up to the first IPv4 entry */
          for(; rs != NULL; rs = rs->ai_next){
              if(rs->ai_family == AF_INET) {
                  /* typecasting result and assigning */
                  pr =  (struct sockaddr_in *)rs->ai_addr;
                  svr_addr4.sin_addr = pr->sin_addr;
                  break;
              }
}

This is the section that handles the actual hostname resolution. After you initialize the hints structure, getaddrinfo() takes care of the resolution for you. You then get back a list of result structures, from which you can choose among the IPv4 and IPv6 addresses of the host you are connecting to. Pretty neat.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/* process transmission with IPv6 */
/*
 * Process the transmission over IPv6: connect the socket, send the request
 * held in buff, then copy everything received to stdout.
 *
 * fd     - points to the descriptor of an already-created socket
 * saddr6 - server address to connect to
 * buff   - holds the request on entry (presumably NUL-terminated, since
 *          send_msg() is given no length -- confirm); it is then reused as
 *          the receive buffer and must be at least MAX bytes long
 *
 * A connect() failure is reported through err_msg() and nothing is sent.
 */
void get_http_ipv6(const u_int *fd, struct sockaddr_in6 *saddr6, char *buff)
{
  int sock = (int) *fd;
  int nrecv;

  if (connect(sock, (struct sockaddr *) saddr6, sizeof(*saddr6)) == -1) {
      err_msg("connect()");
      return;
  }
  printf("Connect success\n");

  /* start transmission */
  send_msg(sock, buff);

  /*
   * The count is signed so that a negative (error/timeout) return ends the
   * loop instead of wrapping to a huge positive value. Exactly nrecv bytes
   * are written, so no NUL terminator is needed in buff.
   * Assumes recv_msg() returns the number of bytes stored in buff, or a
   * value <= 0 when there is nothing more to read -- confirm.
   */
  while ((nrecv = recv_msg(sock, buff, MAX)) > 0)
      fwrite(buff, 1, (size_t) nrecv, stdout);
}

/* process transmission with IPv4 */
/*
 * Process the transmission over IPv4: connect the socket, send the request
 * held in buff, then copy everything received to stdout.
 *
 * fd     - points to the descriptor of an already-created socket
 * saddr4 - server address to connect to
 * buff   - holds the request on entry (presumably NUL-terminated, since
 *          send_msg() is given no length -- confirm); it is then reused as
 *          the receive buffer and must be at least MAX bytes long
 *
 * A connect() failure is reported through err_msg() and nothing is sent.
 */
void get_http_ipv4(const u_int *fd, struct sockaddr_in *saddr4, char *buff)
{
  int sock = (int) *fd;
  int nrecv;

  if (connect(sock, (struct sockaddr *) saddr4, sizeof(*saddr4)) == -1) {
      err_msg("connect()");
      return;
  }
  printf("Connect success\n");

  /* start transmission */
  send_msg(sock, buff);

  /*
   * The count is signed so that a negative (error/timeout) return ends the
   * loop instead of wrapping to a huge positive value. Exactly nrecv bytes
   * are written, so no NUL terminator is needed in buff.
   * Assumes recv_msg() returns the number of bytes stored in buff, or a
   * value <= 0 when there is nothing more to read -- confirm.
   */
  while ((nrecv = recv_msg(sock, buff, MAX)) > 0)
      fwrite(buff, 1, (size_t) nrecv, stdout);
}

These are the functions that retrieve the HTML data once the host's IP address has been determined. As you can see, there is not much difference between handling IPv4 and IPv6.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
[jaime@LenoLX get_http]$ ./get_http google.com
Connect success
HTTP/1.1 200 OK
Date: Fri, 08 Jul 2016 02:41:29 GMT
Expires: -1
Cache-Control: private, max-age=0
Content-Type: text/html; charset=ISO-8859-1
P3P: CP="This is not a P3P policy! See https://www.google.com/support/accounts/answer/151657?hl=en for more info."
Server: gws
X-XSS-Protection: 1; mode=block
X-Frame-Options: SAMEORIGIN
Set-Cookie: NID=81=c8P0vCMGE0EVBx1CcgD_L0zbXecKEkHGvp3-WqJbLPsYUnMlYqFG-pvG_51WUYGdO5r1uNHvZYBnDGaWia04G6bl_htmxu_Jn5_n2xqeUOJrr8TwIxRuSUw-eW76yrKomwKpCGFPhmQx6Ts; expires=Sat, 07-Jan-2017 02:41:29 GMT; path=/; domain=.google.com; HttpOnly
Accept-Ranges: none
Vary: Accept-Encoding
Transfer-Encoding: chunked

<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8"

.....


from your \u003Ca href=\"/history\"\u003EWeb History\u003C/a\u003E","psrl":"Remove","sbit":"Search by image","srch":"Google Search"},"nds":true,"ovr":{},"pq":"","refpd":true,"rfs":[],"scd":10,"sce":5,"stok":"u8v0kV7gXuv6UGgEJLZk39LT_Ww"},"d":{}};google.y.first.push(function(){if(google.med){google.med('init');google.initHistory();google.med('history');}});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);}
</script></div></body></html>
0

The program works as expected, above is an example of its functioning.

I still have to think of other features for this program. One of them could be to extend the input beyond just fetching index.html, or to download a complete website to a specified directory. Apart from that, I will also be working on validating the input.

More to come.

Comments