Home > Back-end >  C the crawler collapse problem
C++ crawler crash problem

Time:09-19

The crawler's job is to download pictures.
When the spider crawls Baidu it can get through more than one hundred pages, but when it crawls Sina it crashes on the second page. The call stack points to either delete[] or closesocket() inside tcpconnect::tcpclose(). I can understand the delete[] crash: the server may have returned an HTTP response that my parser handled incorrectly, causing an out-of-bounds write. But I cannot understand why closesocket() would crash; the stack shows the fault occurring inside NTDLL.
The general design of the program is: one thread crawls web pages, extracts the img tags and the href links from each HTML document and saves them into two vectors (one for page URLs, one for image URLs); another thread downloads the images listed in the img vector. The sockets used are blocking.
engine.h
 # pragma once 
# include "TCP. H"
#include
#include
#include
using namespace std;
The class mainclass
{
Public:
VectorVectorMainclass starturl (char *);
~ mainclass ();
Void gethtml ();
Void getimg ();
Tcpconnect * httpget;
Tcpconnect * imgget;
Bool threadflag;
Bool urlthreadinf;
Bool imgthreadinf;
Void htmlanalyze (char * doc, int docsize);
Void imganalyze (char * doc, int docsize);
Protected:
The static DWORD WINAPI urlFun (LPVOID a);
The static DWORD WINAPI imgFun (LPVOID a);
Private:
Bool checkurl check (char *);
CRITICAL_SECTION * imgvectorlock;
};

TCP. H
 # pragma once 
# include "TCP. H"
#include
#include
#include
using namespace std;
The class mainclass
{
Public:
VectorVectorMainclass starturl (char *);
~ mainclass ();
Void gethtml ();
Void getimg ();
Tcpconnect * httpget;
Tcpconnect * imgget;
Bool threadflag;
Bool urlthreadinf;
Bool imgthreadinf;
Void htmlanalyze (char * doc, int docsize);
Void imganalyze (char * doc, int docsize);
Protected:
The static DWORD WINAPI urlFun (LPVOID a);
The static DWORD WINAPI imgFun (LPVOID a);
Private:
Bool checkurl check (char *);
CRITICAL_SECTION * imgvectorlock;
};

engine.cpp
 # include "engine. H" 
#include
#include
using namespace std;
// Returns a pointer into `url` marking the tail to use as a file name.
//
// The string is scanned backwards from its last character. The scan stops at
// the first character that is one of / \ < > " ? *, or once 41 characters
// have been examined, or when the beginning of the string is reached. The
// returned pointer addresses the last character examined, so when a separator
// is found the result still starts with that separator (e.g. "/pic.jpg").
//
// An empty string is returned unchanged. The scan never moves before `url`,
// so short URLs without any separator no longer read outside the buffer.
char *getfilename(char *url)
{
    const size_t len = strlen(url);
    if (len == 0)
        return url;

    char *lp = url + len - 1;
    for (int examined = 1; ; ++examined, --lp)
    {
        bool stop = false;
        switch (*lp)
        {
        case '/':
        case '\\':
        case '<':
        case '>':
        case '\"':
        case '?':
        case '*':
            stop = true;
            break;
        default:
            break;
        }
        if (stop)
            break;
        if (examined > 40)  // cap the length of the returned tail
            break;
        if (lp == url)      // reached the first character: nothing left to scan
            break;
    }
    return lp;
}
Mainclass: : mainclass (char * starturl)
{
Url. The push_back (starturl);
Httpget=new tcpconnect ();
Imgget=new tcpconnect ();
Threadflag=true;
Urlthreadinf=true;
Imgvectorlock=new CRITICAL_SECTION ();
InitializeCriticalSection (imgvectorlock);
CreateThread (NULL, 0, mainclass: : urlFun, this, 0, NULL);
CreateThread (NULL, 0, mainclass: : imgFun, this, 0, NULL);
}

Mainclass: : ~ mainclass ()
{
The delete httpget;
The delete imgget;
DeleteCriticalSection (imgvectorlock);
}

Void mainclass: : gethtml ()
{
Int I=0, len, size;
Char * tempurl;
While (threadflag)
{
Tempurl=url. At (I);
Httpget - & gt; Seturl (tempurl);
If (httpget - & gt; Myconnect ()==false)
{
i++;
Httpget - & gt; Tcpclose ();
If (I & gt; Url. The size ()
{
Urlthreadinf=false;
return;
}
continue;
}
Len=httpget - & gt; Makehttpgetmsg ();
Httpget - & gt; Tcpsend (httpget - & gt; Getmsg, len);
Size=httpget - & gt; Recvhtml ();
If (size==1)
{
i++;
Httpget - & gt; Tcpclose ();
If (I & gt; Url. The size ()
{
Urlthreadinf=false;
return;
}
continue;
}
Htmlanalyze (httpget - & gt; Buf, size);
Imganalyze (httpget - & gt; Buf, size);
i++;
Cout & lt; <"Urlcount:" & lt; If (I & gt;=url. The size ()
{
Urlthreadinf=false;
Httpget - & gt; Tcpclose ();
return;
}
Httpget - & gt; Tcpclose ();
}
}

Void mainclass: : getimg ()
{
While (threadflag)
{
Int len, size;
Char * tempurl;
While (threadflag)
{
The EnterCriticalSection (imgvectorlock);
If (img. Empty ())
{
LeaveCriticalSection (imgvectorlock);
continue;
}
Tempurl=img. At (0);
LeaveCriticalSection (imgvectorlock);
Imgget - & gt; Seturl (tempurl);
If (imgget - & gt; Myconnect ()==false)
{
The EnterCriticalSection (imgvectorlock);
Img. Erase (img. The begin ());
LeaveCriticalSection (imgvectorlock);
Imgget - & gt; Tcpclose ();
continue;
}
Len=imgget - & gt; Makehttpgetmsg ();
Imgget - & gt; Tcpsend (imgget - & gt; Getmsg, len);
Size=imgget - & gt; Recvhtml ();
If (size==1)
{
The EnterCriticalSection (imgvectorlock);
Img. Erase (img. The begin ());
LeaveCriticalSection (imgvectorlock);
Imgget - & gt; Tcpclose ();
continue;
}
If (STRSTR (imgget - & gt; The head, "OK"))
{
nullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
  • Related