Chris Umbel

C from erlang via linked-in driver

Erlang truly is a useful language. It's fast, full-featured and elegant. Like all high-level languages, however, you sometimes need to access legacy or system code written in another language like, you guessed it, C.

I came across this situation recently when porting my webcrawler to erlang. Under pressure, http:request proved to be unreliable when used in a multi-threaded scenario. The solution I chose was to bridge erlang with libcurl by way of an erlang "linked-in driver". While I only used it for retrieving web page content into memory, I figure I'll be able to wrap all of libcurl in the future.

C

I'll start out with some C code that will function as the bridge between erlang and libcurl. This is the code that qualifies as a "linked-in" driver. Like all C code there's plenty of plumbing. See my comments for explanation.

Note that I'm using curl's multi interface which doesn't block on I/O.

#include "erl_driver.h"
#include "ei.h"

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <curl/curl.h>

/* per-port driver state: allocated in eurl_drv_start, released in
   eurl_drv_stop, and received by eurl_drv_output as its handle argument */
typedef struct 
{
  ErlDrvPort port;   /* port that eurl_drv_output sends the reply to */
} eurl_data;

/* erlang's firing us up.
   Allocates the per-port state and records the port in it.
   port: the port being opened on this driver.
   buff: command string passed by the emulator; not used here.
   Returns the state cast to ErlDrvData, or ERL_DRV_ERROR_GENERAL when
   the allocation fails so that the state is never dereferenced as NULL. */
static ErlDrvData eurl_drv_start(ErlDrvPort port, char *buff)
{
  eurl_data* d = (eurl_data*)driver_alloc(sizeof(eurl_data));

  (void)buff;

  if(d == NULL)
    return ERL_DRV_ERROR_GENERAL;

  d->port = port;
  return (ErlDrvData)d;
}

/* stop callback: erlang no longer needs this driver instance, so hand
   the state block received as handle back to the driver allocator */
static void eurl_drv_stop(ErlDrvData handle)
{
  char *state = (char*)handle;

  driver_free(state);
}

/* basic data structure that contains the content of a web page;
   write_chunk grows it as data arrives */
struct document
{
  char *memory;  /* heap buffer with the bytes received so far; starts out NULL */
  size_t size;   /* content bytes stored in memory, excluding the 0 byte write_chunk appends */
};

/* Resize the buffer at ptr to size bytes; when ptr is NULL a fresh
   allocation of size bytes is made instead.  Returns the (possibly moved)
   buffer, or whatever the underlying allocator returns on failure. */
static void *myrealloc(void *ptr, size_t size)
{
  return ptr ? realloc(ptr, size) : malloc(size);
}

/* write data into a structure for erlang.
   Installed as the CURLOPT_WRITEFUNCTION callback.
   ptr:   the bytes libcurl just received (size * nmemb of them).
   data:  the struct document registered through CURLOPT_WRITEDATA.
   Appends the bytes to the document and keeps the buffer 0-terminated.
   Returns the number of bytes consumed.  When the buffer cannot be grown
   the document is left untouched (the old buffer stays owned by it) and
   0 is returned, which libcurl treats as a write error for a non-empty
   chunk and aborts the transfer. */
static size_t write_chunk(void *ptr, size_t size, size_t nmemb, void *data)
{
  size_t realsize = size * nmemb;
  struct document *mem = (struct document *)data;
  char *grown = (char *)myrealloc(mem->memory, mem->size + realsize + 1);

  /* keep mem->memory pointing at the old buffer so it is neither leaked
     nor later written past with a stale size */
  if (grown == NULL)
    return 0;

  mem->memory = grown;
  memcpy(&(mem->memory[mem->size]), ptr, realsize);
  mem->size += realsize;
  mem->memory[mem->size] = 0;

  return realsize;
}

/* here's the business end.  don't be intimidated.  most of this code is 
just to satisfy libcurl and isn't plumbing required to talk to erlang.  most of what you'll be interested in is the very beginning and the end. */
static void eurl_drv_output(ErlDrvData handle, char *buff, int bufflen)
{
  eurl_data* d = (eurl_data*)handle;     /* <--ERLANG STUFF */  
  CURLM *multi_handle;
  CURL *curl;
  int is_running;

  struct document doc;
  doc.memory = NULL;
  doc.size = 0;

  /* grab the url from erlang and put 
     it in a NULL terminated string */  
  char* url = malloc(bufflen + 1);  
  memset(url, '\0', bufflen + 1);  
  strncpy(url, buff, bufflen); 

  /* initialize curl for non-blocking multi interface */  
  curl_global_init(CURL_GLOBAL_ALL);
  curl = curl_easy_init();
  multi_handle = curl_multi_init();

  if(curl && multi_handle) {
    /* connect the multi interfact to easy interface */
    curl_multi_add_handle(multi_handle, curl);

    /* tell curl what we indend to do */  
    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_chunk);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&doc);

    while(CURLM_CALL_MULTI_PERFORM == curl_multi_perform(multi_handle, &is_running));

    while(is_running) {
      struct timeval timeout;
      int rc;

      fd_set fdread;
      fd_set fdwrite;
      fd_set fdexcep;
      int maxfd = -1;

      long curl_timeo = -1;

      FD_ZERO(&fdread);
      FD_ZERO(&fdwrite);
      FD_ZERO(&fdexcep);

      /* assign a timeout for the operation */
      timeout.tv_sec = 10;
      timeout.tv_usec = 0;

      curl_multi_timeout(multi_handle, &curl_timeo);

      if(curl_timeo >= 0) {
        timeout.tv_sec = curl_timeo / 1000;
	if(timeout.tv_sec > 1)
          timeout.tv_sec = 1;
        else
          timeout.tv_usec = (curl_timeo % 1000) * 1000;
      }

      /* ask curl for the file descriptors */
      curl_multi_fdset(multi_handle, &fdread, &fdwrite, &fdexcep, &maxfd);

      /* ask OS for their status */
      rc = select(maxfd+1, &fdread, &fdwrite, &fdexcep, &timeout);

      switch(rc) {
      case -1:
        break;
      case 0:
      default:
        while(CURLM_CALL_MULTI_PERFORM ==
              curl_multi_perform(multi_handle, &is_running));

        break;
      }
    }

    curl_easy_cleanup(curl);
    curl_multi_cleanup(curl);

    /* !!! ERLANG STUFF !!! */  
    /* send data back to erlang */  
    driver_output(d->port, doc.memory, doc.size);  
  } else  
    driver_output(d->port, "", 0);  

  printf("%d, %s", (int)doc.size, doc.memory);
      
  if(doc.memory)
    free(doc.memory);

  free(url);
}

/* mapping of the drivers functions */
static ErlDrvEntry eurl_driver_entry = 
  {
    NULL,
    eurl_drv_start,
    eurl_drv_stop, 
    eurl_drv_output,
    NULL,
    NULL,
    "eurl_drv",
    NULL,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL,
    ERL_DRV_EXTENDED_MARKER,
    ERL_DRV_EXTENDED_MAJOR_VERSION,
    ERL_DRV_EXTENDED_MAJOR_VERSION,
     ERL_DRV_FLAG_USE_PORT_LOCKING
  };

/* driver init function, declared through the DRIVER_INIT macro for the
   name eurl_drv; it returns the address of eurl_driver_entry, i.e. the
   table of callbacks (presumably read by the emulator when the driver
   is loaded) */
DRIVER_INIT(eurl_drv)
{   
  return &eurl_driver_entry;
}

Build

The next task is to build the driver as a shared library.

gcc -o eurl_drv.so -L/usr/lib/erlang/lib/erl_interface-3.5.7 -I/usr/lib/erlang/lib/erl_interface-3.5.7/include/ -fpic -rdynamic -shared -fno-common eurl.c -lerl_interface -lei -lcurl

Erlang wrapper

Depending on your point of view, the hard part is now done. Now I'll slap a little thread-safe erlang wrapper together.

-module(eurl).
-export([start/0, curl/1]).

%% initialize the driver
%% Loads the eurl_drv library and makes sure the name eurl_proc is
%% registered.  Exits with {error, could_not_load_driver} when the
%% library cannot be loaded.  Returns true.
start() ->
%% load the C code
%% note the name of the library is 
%% eurl_drv in the current directory
  case erl_ddll:load_driver(".", eurl_drv) of
    ok -> ok;
    {error, already_loaded} -> ok;
    _ -> exit({error, could_not_load_driver})
  end,

%% register only when the name is free: an unconditional register/2
%% raises badarg on every call made while eurl_proc is still registered
  case whereis(eurl_proc) of
    undefined -> register(eurl_proc, self());
    _Registered -> true
  end.
  
%% perform a curl request and return string data
%% Url is the address to fetch, as a string.  Opens a fresh port on the
%% eurl_drv driver for the request, waits for its single reply and
%% returns the body as a list of bytes.
curl(Url) ->
%% start the conversation with the driver
  P = open_port({spawn, eurl_drv}, [binary]),
%% instruct the driver to execute the curl request
  port_command(P, [list_to_binary(Url)]),
%% read the response; match on this port and on the data tag so that
%% unrelated tuples in the caller's mailbox are not taken for the reply
  BinaryData =
    receive
      {P, {data, Data}} -> Data
    end,
  port_close(P),

%% convert to list and return
  binary_to_list(BinaryData).

Usage

Now let's test it from the erlang shell.

1> eurl:start().
2> Body = eurl:curl("http://erlang.org/").  

And as you'd have guessed the full HTML of erlang.org is now in the Body variable.

Wed Sep 16 2009 21:09:00 GMT+0000 (UTC)

11 Comments Comment Feed - Permalink
Nice intro, except for the 'noop' thing which is pretty ugly actually :) The rest was a really nice intro though. /Mazen
by Mazen Harake on Fri Nov 06 2009 09:11:54 GMT+0000 (UTC)
Hey, thanks for the feedback.  You're certainly right about that.  Can't imagine I intended to leave it that way.  I'm certainly cleaning that up.
by chrisumbel on Wed Nov 11 2009 09:11:41 GMT+0000 (UTC)
I've heard of another project using libcurl with its erlang crawler. Did you try ibrowse? Is there any theoretical benefit to libcurl over ibrowse (or any native erlang client, assuming it works)? How was the http module failing under pressure?
by Michael Terry on Tue Jun 22 2010 00:06:46 GMT+0000 (UTC)
eee, it's been a while and have had little reason to look at the crawler in months but I dealt with some kind of persistent failure (I realize that's not useful information:)) that exhibited itself with extreme parallelization (which is why I went to erlang in the first place).    

I also do recall mucking with ibrowse and ultimately dismissing it while prototyping the project for some reason or another.    

Maybe over the next week or so I'll dig through the project and see if I can refresh my memory.  Believe-you-me I'd love to keep that as pure-erlang as possible and, assuming it worked, I'd prefer a pure-erlang implementation.

Also, keep in mind the purpose of the article.  It's a description of how to write a linked-in driver in general, not http fetching.  The curl-based http fetch was just an example payload even if it turns out to be contrived.
by chrisumbel on Wed Jun 23 2010 04:06:19 GMT+0000 (UTC)
Oh, sure, no problem. I was just curious. As a linked-in driver example, it's great, and actually more interesting to me than others I've seen.
by Michael Terry on Thu Jun 24 2010 00:06:13 GMT+0000 (UTC)
It seems that C I/O functions block the OS thread rather than the erlang process. So, concurrent calls to eurl:curl() seem to be serialized. Running ex1:run() with large concurrency (M) and a slow-responding URL (Url) in the following code illustrates this problem. ("curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10);" in eurl.c kills the Erlang VM on a slow page due to the timeout. So, running this test requires it to be disabled.)

-module(ex1).
-compile(export_all).

%% Entry point of the test: calls eurl:start(), spawns M tester processes
%% that each fetch Url N times, then blocks until any message arrives in
%% the caller's mailbox.
run(M, N, Url) ->
    eurl:start(),
    spawn_testers(M, N, Url),
    receive _ -> ok end.

%% Spawns M processes running tester(N, Url), counting M down to 0.
%% Returns ok once the count reaches 0.
spawn_testers(0, _, _) -> ok;
spawn_testers(M, N, Url) ->
    spawn(?MODULE, tester, [N, Url]),
    spawn_testers(M - 1, N, Url).

%% Body of one tester process: prints a start line with the current
%% now() value and its own pid, then performs N fetches of Url.
tester(N, Url) ->
    io:format("Start tester: ~p ~p~n", [now(), self()]),
    tester_loop(N, Url).

%% Calls eurl:curl(Url) N times in sequence; after each call prints the
%% remaining count, the pid and the now() value, so the printed times show
%% when each request finished.  The fetched body is discarded.
tester_loop(0, _) -> ok;
tester_loop(N, Url) ->
    eurl:curl(Url),
    io:format("~p: ~p ~p~n", [N, self(), now()]),
    tester_loop(N - 1, Url).
by cignos on Fri Jul 23 2010 08:07:41 GMT+0000 (UTC)
Hi Chris,

You said: "I've heard of another project using libcurl with its erlang crawler". Which project is it?
by Michael on Sun Aug 01 2010 17:06:36 GMT+0000 (UTC)
Hi cignos,

Your note about blocking the OS thread is interesting.
How can we fix Chris's C code then?
by Michael on Sun Aug 01 2010 17:09:05 GMT+0000 (UTC)
Hi, Michael,

Well, I searched the solution in Internet and as a matter of course, I found it is provided erlang thread wrapper version of blocking C functions, as in GNU pth, State Threads Library, etc.
by cignos on Tue Aug 03 2010 03:06:51 GMT+0000 (UTC)
Michael, it was actually another poster who mentioned another project using libcurl.  I too am interested in knowing which project.

Starting to look like I need threaded comments around here:)
by chrisumbel on Fri Aug 06 2010 12:04:44 GMT+0000 (UTC)
While it should still be considered a work in progress I changed this to use libcurl's multi interface to avoid blocking on I/O.
by chrisumbel on Tue Sep 21 2010 01:59:14 GMT+0000 (UTC)
Add a comment
Name
E mail (Private)
URL
Follow Chris
RSS Feed
Twitter
Facebook
CodePlex
github
LinkedIn
Google