本网站仅单台服务器: 2核2G 99￥包年，集成兼容 MySQL 的 MyTopling 高压缩高性能数据库

本网站仅单台服务器: MyTopling 2核2G 99￥包年

当搜索词中有错别字时，搜索引擎会尝试纠错

通过相似拼音纠错

搜索引擎把这些字还原成拼音，用一个拼音相同的已知的搜索词代替。

这是一种众所周知的纠错策略，但是，当输错的字是多音字，特别是有多个这样的错误输入时，所有的搜索引擎都尽量绕开这个问题，或者仅使用最常用的那些音去纠错。因为要考虑所有可能的拼音组合，在极端情况下会导致指数爆炸！例如某互联网大厂的实现(枚举多音字全排列)。

基于自动机的算法可以完美解决这个指数爆炸问题

这是自动机应用的又一个绝佳范例，作为演示，这个页面只收录了 800万 条 搜索词+词频，数据也不太干净
该算法全部在内存中运行，使用了 293M 内存，这个数据量，如果用传统方法暴力实现，并且达到这个性能，需要 几十G 的内存
暴力方法是 Query 越长越可怕，该算法则是 Query 越长，优势越大
纠错耗时仅供参考(2核2G 99￥包年)，如果你看到搜索耗时过长，很可能是 mmap 数据被 swap 到了硬盘上，再搜索一次会得到客观的搜索耗时

这个算法也可以用来解决用户输入预测(智能提示)功能

用户只输入Query开头部分，就自动提示出整个Query，例如用户输入举头望，就提示出举头望明月。就像现在各种搜索引擎做的那样。

基于编辑距离的纠错

在已知的搜索词中寻找编辑距离与用户 Query 最小的词，使用我的算法也可以高效解决（还没做演示页面）

创建 DFA Key 与搜索 DFA Key 的耗时包含了收集网页展示需要的信息，耗时占比90%以上！

MapReduce做了多余的事情

作者: rockeet 发表日期: 2009年10月27日分类: MapReduce 评论: 0 条阅读次数: 3,496 次

本文假定读者已了解MapReduce。

先说 Map

Map阶段一般做三件事情：

继续阅读 →

MapReduce Key Revert ——特定数据模式的负载均衡

作者: rockeet 发表日期: 2009年10月27日分类: MapReduce 评论: 0 条阅读次数: 3,349 次

符号、记法

其中{k,v}指一个Key,Value对，{..} 中第一个分量是Key，第二个是Value。

[e]指一个集合，其中的元素为e。 [{k,v}]就指一个{k,v}的集合。

继续阅读 →

memory pool 的高效实现（代码）

作者: rockeet 发表日期: 2009年10月23日分类: C++ 评论: 2 条阅读次数: 3,105 次

mpool.h

#ifndef __febird_c_mpool_h__
#define __febird_c_mpool_h__

#ifdef __cplusplus
extern "C" {
#endif

//------------------------------------------------------------------------------------------
// Forward declaration. Without it, `struct chunk_allocator` would first appear
// inside the parameter lists below, where C gives it prototype scope only, i.e.
// a type distinct from the struct defined later in this header.
// Note: size_t must already be visible (e.g. via <stddef.h>) where this header
// is included.
struct chunk_allocator;

/// Callback that allocates `size` bytes on behalf of allocator `self`.
typedef void* (*chunk_alloc_ft)(struct chunk_allocator* self, size_t size);
/// Callback that resizes `block` from `old_size` to `new_size` bytes.
typedef void* (*chunk_realloc_ft)(struct chunk_allocator* self, void* block, size_t old_size, size_t new_size);
/// Callback that releases `block`; `size` is the size it was allocated with.
typedef void  (*chunk_free_ft)(struct chunk_allocator* self, void* block, size_t size);

/// Pluggable backing allocator: three callbacks that each receive the
/// allocator itself as first argument. The free and realloc callbacks are told
/// the block's current size, so an implementation needs no per-block header.
struct chunk_allocator
{
	/// allocate `size` bytes
	void *(*chunk_alloc)(struct chunk_allocator *self, size_t size);

	/// resize `block` from `old_size` to `new_size` bytes
	void *(*chunk_realloc)(struct chunk_allocator *self, void *block,
	                       size_t old_size, size_t new_size);

	/// release `block`, which is `size` bytes long
	void (*chunk_free)(struct chunk_allocator *self, void *block, size_t size);
};

/// Pool that hands out cells of one fixed size, carved from larger chunks.
struct fixed_mpool
{
	struct chunk_allocator *ca;      ///< backing allocator for chunks and the chunk array
	struct mpool_chunk     *pChunks; ///< array describing every chunk obtained so far
	struct mpool_cell      *head;    ///< first free cell, NULL when none is free

	size_t iNextChunk;  ///< number of entries of pChunks in use
	size_t nChunks;     ///< capacity of pChunks, in entries
	size_t cell_size;   ///< bytes per cell
	size_t chunk_size;  ///< bytes requested for the next chunk
	size_t used_cells;  ///< cells currently handed out
};

// Compile-time switch: define FEBIRD_MPOOL_ALLOW_BIG_BLOCK to let mpool serve
// requests larger than max_cell_size.
//#define FEBIRD_MPOOL_ALLOW_BIG_BLOCK
#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK
/// Doubly linked list node; struct mpool embeds one as the list anchor.
struct big_block_header
{
	struct big_block_header *next;
	struct big_block_header *prev;
};
#endif

/// Pool for variable-size requests, built from an array of fixed_mpool.
struct mpool
{
	//--------------------------------------------------------------------------------------
	/// Exported chunk_allocator interface. These three members come first and
	/// mirror struct chunk_allocator member for member.
	void *(*chunk_alloc)(struct chunk_allocator *self, size_t size);
	void *(*chunk_realloc)(struct chunk_allocator *self, void *block,
	                       size_t old_size, size_t new_size);
	void (*chunk_free)(struct chunk_allocator *self, void *block, size_t size);
	//--------------------------------------------------------------------------------------

	/// allocator this mpool itself draws memory from
	struct chunk_allocator *ca;

	struct fixed_mpool *fixed; ///< array of fixed-size pools
	size_t max_cell_size;      ///< largest cell size served by `fixed`
	size_t chunk_size;         ///< chunk size handed to each fixed pool
#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK
	size_t big_blocks; ///< count of blocks with size > max_cell_size (rare case)
	struct big_block_header big_list; ///< list anchor for those blocks
#endif
};

/// Initializes a fixed-size pool. The caller sets cell_size, chunk_size and ca
/// (NULL selects the built-in malloc-based allocator) beforehand.
void fixed_mpool_init(struct fixed_mpool* mpf);
/// Releases every chunk of the pool and its chunk array.
void fixed_mpool_destroy(struct fixed_mpool* mpf);

/// Returns one free cell, or NULL if a new chunk cannot be obtained.
void* fixed_mpool_alloc(struct fixed_mpool* mpf);
/// Returns the cell `ptr` to the pool's free list.
void fixed_mpool_free(struct fixed_mpool* mpf, void* ptr);


/// Initializes a variable-size pool. The caller sets max_cell_size, chunk_size
/// and ca beforehand.
void mpool_init(struct mpool* mp);
/// Destroys all fixed pools owned by `mp`.
void mpool_destroy(struct mpool* mp);

/// Allocates `size` bytes from the fixed pool matching `size`.
void* mpool_alloc(struct mpool* mp, size_t size);
/// Frees `ptr`; `size` selects the fixed pool the cell is returned to.
void mpool_free(struct mpool* mp, void* ptr, size_t size);

/// Total number of cells currently handed out across all fixed pools.
size_t mpool_used_cells(const struct mpool* mp);
/// Sum over all fixed pools of cell_size * used_cells.
size_t mpool_used_bytes(const struct mpool* mp);

#ifdef __cplusplus
}
#endif

#endif // __febird_c_mpool_h__

#ifndef __febird_c_mpool_h__

#define __febird_c_mpool_h__

#ifdef __cplusplus

extern "C" {

#endif

//------------------------------------------------------------------------------------------

typedef void* (*chunk_alloc_ft)(struct chunk_allocator* self, size_t size);

typedef void* (*chunk_realloc_ft)(struct chunk_allocator* self, void* block, size_t olc_size, size_t new_size);

typedef void (*chunk_free_ft)(struct chunk_allocator* self, void* block, size_t size);

struct chunk_allocator

{

void* (*chunk_alloc)(struct chunk_allocator* self, size_t size);

void* (*chunk_realloc)(struct chunk_allocator* self, void* block, size_t olc_size, size_t new_size);

void (*chunk_free)(struct chunk_allocator* self, void* block, size_t size);

};

struct fixed_mpool

{

struct chunk_allocator* ca;

struct mpool_chunk* pChunks;

struct mpool_cell* head;

size_t iNextChunk;

size_t nChunks;

size_t cell_size;

size_t chunk_size;

size_t used_cells;

// size_t free_cells;

};

// 是否允许 mpool 分配超出 max_cell_size 的内存块

//#define FEBIRD_MPOOL_ALLOW_BIG_BLOCK

#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK

struct big_block_header

{

struct big_block_header *next, *prev;

};

#endif

struct mpool

{

//------------------------------------------------------------------------------------------

/// export a chunk_allocator interface

void* (*chunk_alloc)(struct chunk_allocator* self, size_t size);

void* (*chunk_realloc)(struct chunk_allocator* self, void* block, size_t olc_size, size_t new_size);

void (*chunk_free)(struct chunk_allocator* self, void* block, size_t size);

//------------------------------------------------------------------------------------------

/// chunk_allocator for this mpool self

struct chunk_allocator* ca;

struct fixed_mpool* fixed;

size_t max_cell_size;

size_t chunk_size;

#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK

size_t big_blocks; // size > max_cell_size, use malloc, this is rare case

struct big_block_header big_list;

#endif

};

void fixed_mpool_init(struct fixed_mpool* mpf);

void fixed_mpool_destroy(struct fixed_mpool* mpf);

void* fixed_mpool_alloc(struct fixed_mpool* mpf);

void fixed_mpool_free(struct fixed_mpool* mpf, void* ptr);

void mpool_init(struct mpool* mp);

void mpool_destroy(struct mpool* mp);

void* mpool_alloc(struct mpool* mp, size_t size);

void mpool_free(struct mpool* mp, void* ptr, size_t size);

size_t mpool_used_cells(const struct mpool* mp);

size_t mpool_used_bytes(const struct mpool* mp);

#ifdef __cplusplus

}

#endif

#endif // __febird_c_mpool_h__

mpool.c

#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "mpool.h"

/* Cell sizes are rounded up to a multiple of this value. It must be at least
 * sizeof(void*), because a free cell stores its free-list link in place (see
 * struct mpool_cell); the original author also requires it to be a power of 2. */
#define MPOOL_MIN_CELL 8

/* Chunk sizes are rounded up to a multiple of this value; it also determines
 * the minimum chunk-array capacity: MPOOL_MIN_CHUNK / sizeof(struct mpool_chunk). */
#define MPOOL_MIN_CHUNK 256

/* View of a free cell: its first bytes hold the link to the next free cell. */
struct mpool_cell
{
	struct mpool_cell *next; /* next free cell, NULL at the end of the list */
};

/* Bookkeeping record for one chunk obtained from the chunk allocator. */
struct mpool_chunk
{
	struct mpool_cell *cell; /* start of the chunk, used as an array of cells */
	size_t size;             /* chunk size in bytes */
};

/**********************************************************************************/
/**
 * Default chunk_allocator callbacks: thin wrappers over malloc/realloc/free.
 *
 * The allocator handle and the size hints are not needed by the C heap, so
 * they are explicitly ignored. Parameters are named and the void function does
 * not `return` an expression, which keeps the file valid C (unnamed parameters
 * in a definition and `return expr;` in a void function are C++-only).
 */
static void*
chunk_malloc_alloc(struct chunk_allocator* self, size_t size)
{
	(void)self;
	return malloc(size);
}

static void
chunk_malloc_free(struct chunk_allocator* self, void* block, size_t size)
{
	(void)self;
	(void)size;
	free(block);
}

static void*
chunk_malloc_realloc(struct chunk_allocator* self, void* block, size_t old_size, size_t new_size)
{
	(void)self;
	(void)old_size;
	return realloc(block, new_size);
}
/**********************************************************************************/

/**
 * Generic realloc built from an allocator's own chunk_alloc/chunk_free:
 * allocate a new block, copy min(old_size, new_size) bytes, free the old one.
 *
 * @return the new block, or NULL if the allocation failed, in which case
 *         `ptr` has not been freed.
 */
void*
default_chunk_realloc(struct chunk_allocator* ca, void* ptr, size_t old_size, size_t new_size)
{
	size_t ncopy;
	void* fresh = ca->chunk_alloc(ca, new_size);

	assert(old_size > 0);
	assert(new_size > 0);

	if (fresh != NULL) {
		ncopy = new_size;
		if (old_size < new_size)
			ncopy = old_size;
		memcpy(fresh, ptr, ncopy);
		ca->chunk_free(ca, ptr, old_size);
	}
	return fresh;
}

/**
 * Threads the `cell_count` cells of a fresh chunk into a singly linked list:
 * each cell points to the one `cell_size` bytes after it, the last to NULL.
 * `cell_count` is expected to be at least 1.
 */
static void
mpool_chunk_init(struct mpool_cell* cell, size_t cell_count, size_t cell_size)
{
	char* cur = (char*)cell;
	size_t remaining;

	assert(cell_size % MPOOL_MIN_CELL == 0);

	for (remaining = cell_count - 1; remaining > 0; --remaining) {
		char* following = cur + cell_size;
		((struct mpool_cell*)cur)->next = (struct mpool_cell*)following;
		cur = following;
	}
	((struct mpool_cell*)cur)->next = NULL;
}

/**********************************************************************************/
/* Built-in malloc-based allocator, installed by the init functions when the
 * caller supplies none. Initializers follow the member order of
 * struct chunk_allocator: chunk_alloc, chunk_realloc, chunk_free. */
static struct chunk_allocator fal = {
	chunk_malloc_alloc,
	chunk_malloc_realloc,
	chunk_malloc_free
};

/**********************************************************************************/
/**
 * Initializes a fixed-size pool. No chunk is allocated yet; only the chunk
 * array is.
 *
 * Fields the caller must set before the call:
 *   cell_size   bytes per cell; rounded up to a multiple of MPOOL_MIN_CELL
 *   chunk_size  bytes per chunk, must exceed cell_size; rounded up to a
 *               multiple of MPOOL_MIN_CHUNK
 *   nChunks     requested chunk-array capacity; it is read here, so set it
 *               (0 is fine), values below the minimum are raised
 *   ca          NULL to use the malloc-based default, or an allocator with
 *               chunk_alloc and chunk_free set (a NULL chunk_realloc is
 *               replaced by default_chunk_realloc, modifying the allocator)
 *
 * Aborts the process if the chunk array cannot be allocated.
 */
void fixed_mpool_init(struct fixed_mpool* mpf)
{
	const size_t min_chunks = MPOOL_MIN_CHUNK / sizeof(struct mpool_chunk);
	struct chunk_allocator* al;

	if (NULL == mpf->ca)
		al = mpf->ca = &fal;
	else {
		al = mpf->ca;
		assert(NULL != al->chunk_alloc);
		assert(NULL != al->chunk_free);
		if (NULL == al->chunk_realloc)
			al->chunk_realloc = &default_chunk_realloc;
	}
	assert(mpf->chunk_size > 0);
	assert(mpf->cell_size > 0);
	assert(mpf->cell_size < mpf->chunk_size);

	// round both sizes up to their granularity
	mpf->cell_size = (mpf->cell_size + MPOOL_MIN_CELL - 1) / MPOOL_MIN_CELL * MPOOL_MIN_CELL;
	mpf->chunk_size = (mpf->chunk_size + MPOOL_MIN_CHUNK - 1) / MPOOL_MIN_CHUNK * MPOOL_MIN_CHUNK;

	mpf->head = NULL;

	if (mpf->nChunks < min_chunks)
		mpf->nChunks = min_chunks;

	mpf->iNextChunk = 0;
	mpf->pChunks = (struct mpool_chunk*)
		al->chunk_alloc(al, sizeof(struct mpool_chunk) * mpf->nChunks);
	if (NULL == mpf->pChunks) {
		// %zu is the conversion for size_t; the message ends with a real newline
		fprintf(stderr
			, "fatal: febird.fixed_mpool_init failed, chunk[size=%zu, count=%zu]\n"
			, mpf->chunk_size, mpf->nChunks);
		abort();
	}
	mpf->used_cells = 0;
}

/**
 * Releases every chunk of the pool (newest first) and then the chunk array.
 * The pool must be re-initialized before it is used again.
 */
void fixed_mpool_destroy(struct fixed_mpool* mpf)
{
	struct chunk_allocator* ca = mpf->ca;
	size_t i;

	// Count down in size_t: a `long` index would truncate the chunk count on
	// platforms where long is narrower than size_t.
	for (i = mpf->iNextChunk; i-- > 0; )
		ca->chunk_free(ca, mpf->pChunks[i].cell, mpf->pChunks[i].size);

	ca->chunk_free(ca, mpf->pChunks, sizeof(struct mpool_chunk) * mpf->nChunks);
}

// 0 success, -1 fail
int fixed_mpool_add_chunk(struct fixed_mpool* mpf)
{
	struct mpool_cell* cell;
	
	assert(mpf->pChunks != NULL);

	if (mpf->iNextChunk == mpf->nChunks) {
		size_t old_size = sizeof(struct mpool_chunk) * mpf->nChunks;
		size_t new_size = 2 * old_size;

		// allocate new chunk array
		struct mpool_chunk* c = (struct mpool_chunk*)
			mpf->ca->chunk_realloc(mpf->ca, mpf->pChunks, old_size, new_size);

		if (NULL == c) return -1;
		mpf->pChunks = c;
		mpf->nChunks *= 2;     // chunk array expanded 2
		mpf->chunk_size *= 2;  // chunk_size  expanded 2 also
	}

	// allocate a new cell array
	cell = (struct mpool_cell*)mpf->ca->chunk_alloc(mpf->ca, mpf->chunk_size);
	if (NULL == cell) return -1;
	mpf->pChunks[mpf->iNextChunk].cell = cell;
	mpf->pChunks[mpf->iNextChunk].size = mpf->chunk_size;
	mpf->iNextChunk++;
	mpool_chunk_init(cell, mpf->chunk_size / mpf->cell_size, mpf->cell_size);
	mpf->head = cell;
	
	return 0;
}

void* fixed_mpool_alloc(struct fixed_mpool* mpf)
{
	struct mpool_cell* cell;

	if (NULL == mpf->head) {
		// in most case it will not run this path

		if (fixed_mpool_add_chunk(mpf) != 0)
			return NULL;
	}
	cell = mpf->head;
	mpf->used_cells++;
	mpf->head = mpf->head->next;
	return cell;
}

/**
 * Pushes the cell `ptr` onto the front of the free list.
 * `ptr` is written through, so it must point to a cell of this pool.
 */
void fixed_mpool_free(struct fixed_mpool* mpf, void* ptr)
{
	struct mpool_cell* released = (struct mpool_cell*)ptr;

	released->next = mpf->head;
	mpf->head = released;
	mpf->used_cells--;
}

/**********************************************************************************/
/**
 * require initialized fields:
 *   max_cell_size
 *	 chunk_size
 *	 ca [0 OR initialized]
 */
void mpool_init(struct mpool* mp)
{
	int i, nFixed;
	struct chunk_allocator* al;
	
	assert(mp->max_cell_size < mp->chunk_size);

	if (NULL == mp->chunk_alloc)
		al = mp->ca = &fal;
	else {
		al = mp->ca;
		assert(NULL != al->chunk_alloc);
		assert(NULL != al->chunk_free);
		if (NULL == al->chunk_realloc)
			al->chunk_realloc = &default_chunk_realloc;
	}
	mp->chunk_alloc = (chunk_alloc_ft)&mpool_alloc;
	mp->chunk_free = (chunk_free_ft)&mpool_free;
	mp->chunk_realloc = (chunk_realloc_ft)&default_chunk_realloc;

	mp->max_cell_size = (mp->max_cell_size + MPOOL_MIN_CELL - 1) / MPOOL_MIN_CELL * MPOOL_MIN_CELL;
	mp->chunk_size = (mp->chunk_size + MPOOL_MIN_CHUNK - 1) / MPOOL_MIN_CHUNK * MPOOL_MIN_CHUNK;
	nFixed = mp->max_cell_size / MPOOL_MIN_CELL;

	mp->fixed = (struct fixed_mpool*)al->chunk_alloc(al, sizeof(struct fixed_mpool) * nFixed);
	if (NULL == mp->fixed) {
		fprintf(stderr, "fatal: febird.mpool_init[max_cell_size=%zd, chunk_size=%zd]/n"
				, mp->max_cell_size, mp->chunk_size);
		abort();
	}

	for (i = 0; i < nFixed; ++i)
	{
		mp->fixed[i].cell_size = (i + 1) * MPOOL_MIN_CELL;
		mp->fixed[i].chunk_size = mp->chunk_size;
		mp->fixed[i].nChunks = 16;
		mp->fixed[i].ca = mp->ca;
		fixed_mpool_init(&mp->fixed[i]);
	}
#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK
	mp->big_blocks = 0;
	mp->big_list.prev = mp->big_list.next = &mp->big_list;
#endif
}

/**
 * Destroys the pool: frees any big blocks still outstanding (when
 * FEBIRD_MPOOL_ALLOW_BIG_BLOCK is defined), destroys every fixed pool, and
 * releases the array of fixed pools. Cells never returned by the user are
 * released along with their chunks.
 */
void mpool_destroy(struct mpool* mp)
{
	size_t i, nFixed;
#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK
	struct big_block_header *p;
	for (i = 0, p = mp->big_list.next; p != &mp->big_list; ++i)
	{
		struct big_block_header *q;
		// more nodes than recorded: the list is corrupt
		if (i > mp->big_blocks) {
			fprintf(stderr, "fatal: febird.mpool_destroy\n");
			abort();
		}
		q = p->next; // read the link before the node is freed
		free(p);
		p = q;
	}
	assert(i == mp->big_blocks);
#endif
	nFixed = mp->max_cell_size / MPOOL_MIN_CELL;
	for (i = 0; i < nFixed; ++i)
	{
		fixed_mpool_destroy(&mp->fixed[i]);
	}
	mp->ca->chunk_free(mp->ca, mp->fixed, sizeof(struct fixed_mpool) * nFixed);
}

/**
 * Allocates `size` bytes.
 *
 * Requests up to max_cell_size are served by the fixed pool whose cell size is
 * `size` rounded up to a multiple of MPOOL_MIN_CELL. Larger requests are
 * served with malloc when FEBIRD_MPOOL_ALLOW_BIG_BLOCK is defined and are
 * rejected otherwise.
 *
 * @return the block, or NULL on failure or for an unsupported size (0, or
 *         larger than max_cell_size without big-block support). Unsupported
 *         sizes also trip an assert in debug builds.
 */
void* mpool_alloc(struct mpool* mp, size_t size)
{
	size_t idx;
	struct fixed_mpool* mpf;
	struct mpool_cell* cell;

#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK
	if (size > mp->max_cell_size) {
		// this is rare case
		struct big_block_header *p, *h;
		// header + payload must not wrap around size_t
		if (size > (size_t)-1 - sizeof(struct big_block_header))
			return NULL;
		p = (struct big_block_header*)malloc(sizeof(struct big_block_header) + size);
		if (p) {
			// insert right after the list anchor
			h = &mp->big_list;
			p->prev = h;
			p->next = h->next;
			h->next->prev = p;
			h->next = p;
			mp->big_blocks++;
			return (p + 1); // payload starts behind the header
		} else
			return NULL;
	}
#else
	assert(size <= mp->max_cell_size);
	// with NDEBUG the assert vanishes; never index past fixed[]
	if (size > mp->max_cell_size)
		return NULL;
#endif
	assert(size > 0);
	// size 0 would make (size - 1) wrap to a huge index
	if (0 == size)
		return NULL;

	idx = (size - 1) / MPOOL_MIN_CELL;
	mpf = &mp->fixed[idx];

// same as fixed_mpool_alloc....
	if (NULL == mpf->head) {
		// in most case it will not run this path

		if (fixed_mpool_add_chunk(mpf) != 0)
			return NULL;
	}
	cell = mpf->head;
	mpf->used_cells++;
	mpf->head = mpf->head->next;
	return cell;
}

/**
 * Frees a block obtained from mpool_alloc. `size` must be the size passed to
 * mpool_alloc: it alone decides which fixed pool (or the big-block list) the
 * block is returned to.
 *
 * A NULL `ptr` is ignored, like free(NULL). An unsupported `size` (0, or
 * larger than max_cell_size without big-block support) trips an assert in
 * debug builds and is ignored otherwise.
 */
void mpool_free(struct mpool* mp, void* ptr, size_t size)
{
	size_t idx;
	struct fixed_mpool* mpf;
	struct mpool_cell* cell;

	if (NULL == ptr)
		return;

#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK
	if (size > mp->max_cell_size) {
		// this is rare case
		// the list header sits directly in front of the payload
		struct big_block_header* bbh = (struct big_block_header*)ptr - 1;
		bbh->prev->next = bbh->next;
		bbh->next->prev = bbh->prev;
		free(bbh);
		mp->big_blocks--;
		return;
	}
#else
	assert(size <= mp->max_cell_size);
	// with NDEBUG the assert vanishes; never index past fixed[]
	if (size > mp->max_cell_size)
		return;
#endif
	assert(size > 0);
	// size 0 would make (size - 1) wrap to a huge index
	if (0 == size)
		return;

	idx = (size - 1) / MPOOL_MIN_CELL;
	mpf = &mp->fixed[idx];

// same as fixed_mpool_free...
	cell = (struct mpool_cell*)ptr;
	cell->next = mpf->head;
	mpf->used_cells--;
	mpf->head = cell;
}

/**
 * @return the number of cells currently handed out, summed over all fixed pools
 */
size_t mpool_used_cells(const struct mpool* mp)
{
	size_t remaining = mp->max_cell_size / MPOOL_MIN_CELL;
	size_t total = 0;

	while (remaining-- > 0)
		total += mp->fixed[remaining].used_cells;
	return total;
}

size_t mpool_used_bytes(const struct mpool* mp)
{
	size_t i, n = mp->max_cell_size / MPOOL_MIN_CELL;
	size_t used = 0;
	for (i = 0; i < n; ++i)
		used += mp->fixed[i].cell_size * mp->fixed[i].used_cells;
	return used;
}

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

#include "stdafx.h"

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <assert.h>

#include "mpool.h"

/* must be power of 2 and greater than sizeof(void*) */

#define MPOOL_MIN_CELL 8

#define MPOOL_MIN_CHUNK 256

struct mpool_cell

{

struct mpool_cell* next;

};

struct mpool_chunk

{

struct mpool_cell* cell; // cell array

size_t size; // size in bytes;

};

/**********************************************************************************/

/**

* chunk_allocator use malloc/free

*/

static void*

chunk_malloc_alloc(struct chunk_allocator* , size_t size)

{

return malloc(size);

}

static void

chunk_malloc_free(struct chunk_allocator* , void* block, size_t size)

{

return free(block);

}

static void*

chunk_malloc_realloc(struct chunk_allocator* , void* block, size_t old_size, size_t new_size)

{

return realloc(block, new_size);

}

/**********************************************************************************/

void*

default_chunk_realloc(struct chunk_allocator* ca, void* ptr, size_t old_size, size_t new_size)

{

void* q = ca->chunk_alloc(ca, new_size);

assert(old_size > 0);

assert(new_size > 0);

if (NULL == q) return NULL;

memcpy(q, ptr, old_size < new_size ? old_size : new_size);

ca->chunk_free(ca, ptr, old_size);

return q;

}

static void

mpool_chunk_init(struct mpool_cell* cell, size_t cell_count, size_t cell_size)

{

size_t i;

struct mpool_cell* p = cell;

assert(cell_size % MPOOL_MIN_CELL == 0);

for (i = 0; i < cell_count-1; ++i)

p = p->next = (struct mpool_cell*)((char*)p + cell_size);

p->next = NULL;

}

/**********************************************************************************/

static struct chunk_allocator fal = {

&chunk_malloc_alloc,

&chunk_malloc_realloc,

&chunk_malloc_free

};

/**********************************************************************************/

/**

* require initialized fields:

* cell_size

* chunk_size

* ca [0 OR initialized]

*/

void fixed_mpool_init(struct fixed_mpool* mpf)

{

struct chunk_allocator* al;

if (NULL == mpf->ca)

al = mpf->ca = &fal;

else {

al = mpf->ca;

assert(NULL != al->chunk_alloc);

assert(NULL != al->chunk_free);

if (NULL == al->chunk_realloc)

al->chunk_realloc = &default_chunk_realloc;

}

assert(mpf->chunk_size > 0);

assert(mpf->cell_size > 0);

assert(mpf->cell_size < mpf->chunk_size);

mpf->cell_size = (mpf->cell_size + MPOOL_MIN_CELL - 1) / MPOOL_MIN_CELL * MPOOL_MIN_CELL;

mpf->chunk_size = (mpf->chunk_size + MPOOL_MIN_CHUNK - 1) / MPOOL_MIN_CHUNK * MPOOL_MIN_CHUNK;

mpf->head = NULL;

if (mpf->nChunks < MPOOL_MIN_CHUNK/sizeof(struct mpool_chunk))

mpf->nChunks = MPOOL_MIN_CHUNK/sizeof(struct mpool_chunk);

mpf->iNextChunk = 0;

mpf->pChunks = (struct mpool_chunk*)

al->chunk_alloc(al, sizeof(struct mpool_chunk) * mpf->nChunks);

if (NULL == mpf->pChunks) {

fprintf(stderr

, "fatal: febird.fixed_mpool_init failed, chunk[size=%zd, count=%zd]/n"

, mpf->chunk_size, mpf->nChunks);

abort();

}

mpf->used_cells = 0;

}

void fixed_mpool_destroy(struct fixed_mpool* mpf)

{

struct chunk_allocator* ca = mpf->ca;

long i;

for (i = mpf->iNextChunk - 1; i >= 0; --i)

ca->chunk_free(ca, mpf->pChunks[i].cell, mpf->pChunks[i].size);

ca->chunk_free(ca, mpf->pChunks, sizeof(struct mpool_chunk) * mpf->nChunks);

}

// 0 success, -1 fail

int fixed_mpool_add_chunk(struct fixed_mpool* mpf)

{

struct mpool_cell* cell;

assert(mpf->pChunks != NULL);

if (mpf->iNextChunk == mpf->nChunks) {

size_t old_size = sizeof(struct mpool_chunk) * mpf->nChunks;

size_t new_size = 2 * old_size;

// allocate new chunk array

struct mpool_chunk* c = (struct mpool_chunk*)

mpf->ca->chunk_realloc(mpf->ca, mpf->pChunks, old_size, new_size);

if (NULL == c) return -1;

mpf->pChunks = c;

mpf->nChunks *= 2; // chunk array expanded 2

mpf->chunk_size *= 2; // chunk_size expanded 2 also

}

// allocate a new cell array

cell = (struct mpool_cell*)mpf->ca->chunk_alloc(mpf->ca, mpf->chunk_size);

if (NULL == cell) return -1;

mpf->pChunks[mpf->iNextChunk].cell = cell;

mpf->pChunks[mpf->iNextChunk].size = mpf->chunk_size;

mpf->iNextChunk++;

mpool_chunk_init(cell, mpf->chunk_size / mpf->cell_size, mpf->cell_size);

mpf->head = cell;

return 0;

}

void* fixed_mpool_alloc(struct fixed_mpool* mpf)

{

struct mpool_cell* cell;

if (NULL == mpf->head) {

// in most case it will not run this path

if (fixed_mpool_add_chunk(mpf) != 0)

return NULL;

}

cell = mpf->head;

mpf->used_cells++;

mpf->head = mpf->head->next;

return cell;

}

void fixed_mpool_free(struct fixed_mpool* mpf, void* ptr)

{

struct mpool_cell* cell = (struct mpool_cell*)ptr;

cell->next = mpf->head;

mpf->used_cells--;

mpf->head = cell;

}

/**********************************************************************************/

/**

* require initialized fields:

* max_cell_size

* chunk_size

* ca [0 OR initialized]

*/

void mpool_init(struct mpool* mp)

{

int i, nFixed;

struct chunk_allocator* al;

assert(mp->max_cell_size < mp->chunk_size);

if (NULL == mp->chunk_alloc)

al = mp->ca = &fal;

else {

al = mp->ca;

assert(NULL != al->chunk_alloc);

assert(NULL != al->chunk_free);

if (NULL == al->chunk_realloc)

al->chunk_realloc = &default_chunk_realloc;

}

mp->chunk_alloc = (chunk_alloc_ft)&mpool_alloc;

mp->chunk_free = (chunk_free_ft)&mpool_free;

mp->chunk_realloc = (chunk_realloc_ft)&default_chunk_realloc;

mp->max_cell_size = (mp->max_cell_size + MPOOL_MIN_CELL - 1) / MPOOL_MIN_CELL * MPOOL_MIN_CELL;

mp->chunk_size = (mp->chunk_size + MPOOL_MIN_CHUNK - 1) / MPOOL_MIN_CHUNK * MPOOL_MIN_CHUNK;

nFixed = mp->max_cell_size / MPOOL_MIN_CELL;

mp->fixed = (struct fixed_mpool*)al->chunk_alloc(al, sizeof(struct fixed_mpool) * nFixed);

if (NULL == mp->fixed) {

fprintf(stderr, "fatal: febird.mpool_init[max_cell_size=%zd, chunk_size=%zd]/n"

, mp->max_cell_size, mp->chunk_size);

abort();

}

for (i = 0; i < nFixed; ++i)

{

mp->fixed[i].cell_size = (i + 1) * MPOOL_MIN_CELL;

mp->fixed[i].chunk_size = mp->chunk_size;

mp->fixed[i].nChunks = 16;

mp->fixed[i].ca = mp->ca;

fixed_mpool_init(&mp->fixed[i]);

}

#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK

mp->big_blocks = 0;

mp->big_list.prev = mp->big_list.next = &mp->big_list;

#endif

}

void mpool_destroy(struct mpool* mp)

{

size_t i, nFixed;

#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK

struct big_block_header *p;

for (i = 0, p = mp->big_list.next; p != &mp->big_list; ++i)

{

struct big_block_header *q;

if (i > mp->big_blocks) {

fprintf(stderr, "fatal: febird.mpool_destroy/n");

abort();

}

q = p->next;

free(p);

p = q;

}

assert(i == mp->big_blocks);

#endif

nFixed = mp->max_cell_size / MPOOL_MIN_CELL;

for (i = 0; i < nFixed; ++i)

{

fixed_mpool_destroy(&mp->fixed[i]);

}

mp->ca->chunk_free(mp->ca, mp->fixed, sizeof(struct fixed_mpool) * nFixed);

}

void* mpool_alloc(struct mpool* mp, size_t size)

{

size_t idx;

struct fixed_mpool* mpf;

struct mpool_cell* cell;

#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK

if (size > mp->max_cell_size) {

// this is rare case

struct big_block_header *p, *h;

p = (struct big_block_header*)malloc(sizeof(struct big_block_header) + size);

if (p) {

h = &mp->big_list;

p->prev = h;

p->next = h->next;

h->next->prev = p;

h->next = p;

mp->big_blocks++;

return (p + 1);

} else

return NULL;

}

#else

assert(size <= mp->max_cell_size);

#endif

assert(size > 0);

idx = (size - 1) / MPOOL_MIN_CELL;

mpf = &mp->fixed[idx];

// same as fixed_mpool_alloc....

if (NULL == mpf->head) {

// in most case it will not run this path

if (fixed_mpool_add_chunk(mpf) != 0)

return NULL;

}

cell = mpf->head;

mpf->used_cells++;

mpf->head = mpf->head->next;

return cell;

}

void mpool_free(struct mpool* mp, void* ptr, size_t size)

{

size_t idx;

struct fixed_mpool* mpf;

struct mpool_cell* cell;

#ifdef FEBIRD_MPOOL_ALLOW_BIG_BLOCK

if (size > mp->max_cell_size) {

// this is rare case

struct big_block_header* bbh = (struct big_block_header*)ptr - 1;

bbh->prev->next = bbh->next;

bbh->next->prev = bbh->prev;

free(bbh);

mp->big_blocks--;

return;

}

#else

assert(size <= mp->max_cell_size);

#endif

assert(size > 0);

idx = (size - 1) / MPOOL_MIN_CELL;

mpf = &mp->fixed[idx];

// same as fixed_mpool_free...

cell = (struct mpool_cell*)ptr;

cell->next = mpf->head;

mpf->used_cells--;

mpf->head = cell;

}

size_t mpool_used_cells(const struct mpool* mp)

{

size_t i, n = mp->max_cell_size / MPOOL_MIN_CELL;

size_t used = 0;

for (i = 0; i < n; ++i)

used += mp->fixed[i].used_cells;

return used;

}

size_t mpool_used_bytes(const struct mpool* mp)

{

size_t i, n = mp->max_cell_size / MPOOL_MIN_CELL;

size_t used = 0;

for (i = 0; i < n; ++i)

used += mp->fixed[i].cell_size * mp->fixed[i].used_cells;

return used;

}

memory pool 的高效实现

作者: rockeet 发表日期: 2009年10月23日分类: C++ 评论: 0 条阅读次数: 2,805 次

memory pool

malloc可以分配任意大小的内存，因此，在malloc内部，保存了一些簿记信息（至少有一个包含内存块尺寸的信息）。调用free时，可以正确释放。

为了减少这些簿记开销，可以使用memory pool。

根据使用情境，可以分为两种：

1. 只分配固定大小的内存块，速度最快（normal path约10条机器指令）。

2. 可分配不同大小的内存块，速度稍慢，但比malloc快得多，也无簿记开销。

以下将分别说明

mpool

mpool可分配不同尺寸的内存。大多数时刻，都在内部分配。

销毁mpool时，会自动释放在mpool中未释放的内存。

mpool内部有一个包含多个不同尺寸fixed_mpool的array，根据请求分配的内存大小，直接索引到相应的fixed_mpool来分配一个单元（cell）。

通过定义宏FEBIRD_MPOOL_ALLOW_BIG_BLOCK，就允许大于max_cell_size的内存分配。在这种情况下，使用标准的malloc分配内存，分配出去的内存有额外簿记（用双向链表串起来），以便在销毁mpool时自动释放。

mpool graph

fixed_mpool

尺寸固定的内存池，一旦创建，该内存池只能分配固定尺寸的内存单元（cell）。这在很多情况下都适用，例如链表结点、树节点、图结点、自定义的结构、等等。

用于stl的map/set/list再适合不过了——但是不能用于vector/deque等需要分配可变尺寸的容器。

fixed_mpool内部的多个chunk使用数组，类似std::vector，iNextChunk相当于vector.size，nChunks相当于vector.capacity，每次空间不够时扩张一倍。使用数组，而不是链表，有以下好处：

1. 有助于对齐——如果chunk_allocator是对齐（对齐>=32时）分配的，而chunk使用链表组织，就会在chunk开始处预留一个chunk头部，这会导致不对齐。

2. 如果cell_size也刚好较大且整除chunk_size，使用链表就会浪费将近一个cell（cell_size – chunk_header_size）。

3. 如果不把连接信息保存在chunk header，就需要另外分配chunk结点，而分配chunk结点又需要其它内存分配函数。

空闲表使用单链表，因此，理论上每个cell最小必须能容纳一个指针，32位系统是4字节，64位系统是8字节，实现中使用8字节作为最小值。

世界上应用最广泛的虚拟机是啥？

作者: rockeet 发表日期: 2009年10月18日分类: 杂谈评论: 1 条阅读次数: 2,782 次

别说是JavaVM！

正确答案：x86vm

x86本身是一个硬件vm，它的指令系统是一个vm指令系统，通过翻译层后，才交给下面的RISC内核。

malloc/free 的开销，如何去掉这种开销？

作者: rockeet 发表日期: 2009年10月18日分类: C++ 评论: 0 条阅读次数: 3,001 次

一般的malloc实现，对一块已分配的内存，都有两个机器字的簿记，甚至更多。如果不需要排错，理论上讲，只需要一个字长的额外开销，用来记录这块内存的尺寸（放在intptr[-1]处是个好主意）。

为什么需要这个开销呢？因为free传入的只是个指针，它不知道要释放多大的内存，因此free内部必须通过某种方式来获得这块内存的尺寸。

可以想象，如果用 malloc/free 来作为一个关联数组（map）的分配器，要浪费不少内存。不过好在实际数据的尺寸往往比额外消耗要大很多，相比起来，浪费的比例不算很大，况且现在内存还很便宜。

其实，打造一个高效的分配器并不难，难的是它的适用范围（多线程？cell尺寸，chunk尺寸，对齐，排错…），如果可以忍受这些缺陷，或者说是限制，还是比较值得的。下一步就是它的灵活性——让它可以更加容易集成进其它系统。

对于C标准库，如果能增加一个/一族这样的分配器，还是很有价值的。从理论上讲，只要free时多传一个size参数，就可以完全去掉额外的开销。这样两个函数就可以做到：

/* Proposed sized allocation pair: the caller passes the block size back on
 * free, so the allocator needs no per-block bookkeeping. */
void* salloc(size_t size);
void  sfree(void* ptr, size_t size);

1 2	void* salloc(size_t size); void sfree(void* ptr, size_t size);

这样做还有一个额外的好处，就是可以更好地对齐，假定程序需要按32字节对齐，malloc/free 就至少需要32字节做簿记，如果再加上内存越界检测，就需要64字节。salloc/sfree则只需要将分配的内存对齐到32字节边界即可。

但是这对程序的正确性要求很高，malloc/free中，内存越界检测可以很容易实现，而salloc/sfree就完全做不到（除非增加额外簿记）。一个好主意是可以在debug版中加入这些查错功能，而在release版中去掉。

更好（确切地讲应该是更灵活）的方案是，实现一个

/* Sketch of a pool object that salloc/sfree could wrap; members elided. */
struct mpool{
  // .....
};

// return success or fail
int mpool_init(struct mpool* pool);
void mpool_destroy(struct mpool* pool);
/* sized allocation from `pool` */
void* mpalloc(struct mpool* pool, size_t size);
/* sized free: `size` is the size the block was allocated with */
void  mpfree(struct mpool* pool, void* ptr, size_t size);

struct mpool{

// .....

};

// return success or fail

int mpool_init(struct mpool* pool);

void mpool_destroy(struct mpool* pool);

void* mpalloc(struct mpool* pool, size_t size);

void mpfree(struct mpool* pool, void* ptr, size_t size);

而让 salloc/sfree简单地作为 mpool 的包装。

gcc的std::allocator基本上是按这样的方式实现的，只不过，它的size参数，大多数时刻是自动传递的（知道具体的class/struct，也就知道它的尺寸）。实现方式上，使用 size_aligned/align 作为索引去访问特定尺寸的mempool，一个 mempool 是多个链表串起来的大chunk，每个chunk内部是链表串起来的cell。这也许是最好的实现方式了，除了节省的额外空间开销，时间开销上，如果不考虑加锁，一次alloc平均可以在10时钟周期内完成，dealloc用的时间更短。相比之下malloc/free耗的时间也要多得多。

可变长度数据结构

作者: rockeet 发表日期: 2009年09月27日分类: C++ 评论: 1 条阅读次数: 3,380 次

固定长度的数据结构很简单，大家每天都在用。

可变长度数据结构，都可以通过内嵌对象的形式，转化成固定长度的数据结构，大家每天也都在用，例如：

// Fixed-size object whose string members own separately allocated,
// variable-length data.
struct person
{
    int    id;
    string name;     // variable-length, stored outside the object
    string address;  // variable-length, stored outside the object
};

struct person

{

int id;

string name;

string address;

};

每个 person 对象的长度是固定的，但是，其内嵌的 name 和 address 是变长的。从而，整个对象占据的总空间也是变长的。

但是，将这样的对象平坦化，使之只占据一块连续空间，使用的人很少，因为在绝大多数情况下，很少有人思考这个问题，并且，大多数问题已经使用内嵌数据结构解决掉了。

然而，如果内存很紧张，或者需要处理的数据量非常大，这种方式浪费的内存就太多了。假定我们现在创建了一个 map<int, person> ，在gcc4.1 的 64位环境中，按8字节对齐，这个 map 总共会占用多少内存呢？

sizeof(map) = 48
sizeof(string) = 8
sizeof(person) = 24
sizeof(person_node) = alignto(24 + 32(rbtree node overhead), 8) = 56
sizeof(string_node) = 8*3(refcount, size_endptr, capacity_endptr)
memsizeof(string) = sizeof(string_node) + alignto(strlen + 1, 8)
memsizeof(person_node) = sizeof(person_node) + memsizeof(name) + memsizeof(address)

如果 avg(name) = 10, avg(address) = 20

实际占用的空间，大约还要再加上4：avg(name) = 14, avg(address) = 24

那么 memsizeof(person_node) = 56 + 24*2 + 14 + 24 = 142

那么该map 实际占用空间（gcc 的每个 map 还有一个虚结点:32个字节）：

48+32+n*142 = 80+n*142

如果使用一种理想的变长数据结构，再加上红黑树的优化(none virtual node, compressed color, no parentptr)，需要多少内存呢？

sizeof(map) = 16
memsizeof(person_node) = alignto(16(treenode) + 4(id) + 2*1(strlen) + 10(name) + 20(address) = 52, 8) = 56

memsizeof(map) = 16 + 56*n

内存一下节省了 60% 还多，也就是说，如果内存大小已经固定，可以装入 2.5 倍的数据。

如果用来做集群缓存，可以节省50%的机器（系统也要占一些内存）。
如果有5.6G的内存可用，就可以装入1亿条数据。
并且，在节省了60%内存的同时，还有另外一种好处：提高cache命中率，如果3个字段访问的频率相同，cpu的 cache miss 会降低3倍。
可以预见，因为cache miss降低，map.find 速度会大幅提高。

这个可变数据结构可以这样设计：

// Flattened, variable-length record: the fixed header is followed directly by
// the name bytes and then the address bytes, all in one contiguous block.
// An object must be placed in storage of at least getSize() bytes.
struct person2
{
    int id;
    unsigned char nName, nAddress;  // byte lengths of name and address (max 255)
    char data[1];                   // first byte of the trailing name+address bytes

    // not terminated with '\0': use nName / nAddress for the lengths
    const char* getName()    const { return data; }
    const char* getAddress() const { return data + nName; }
    // total bytes occupied: header up to `data`, plus both strings
    int getSize() const { return offsetof(person2, data) + nName + nAddress; }
};

struct person2

{

int id;

unsigned char nName, nAddress;

char data[1];

// not terminated with '/0'

const char* getName() const { return data; }

const char* getAddress() const { return data + nName; }

int getSize() const { return offsetof(person, data) + nName + nAddress; }

};

更复杂的情况：

如果nName 和 nAddress 要大于 255 呢？——把 unsigned char 改成 unsigned short, 甚至 unsigned int
如果person的字段很多，例如有20个字段：

// Same record with many string fields (conventional, non-flattened form).
struct person3
{
    int id;
    string name;
    string address;
    string ex1;
    string ex2;
    string ex3;
    string ex4;
    string ex5;
//  more...

//  Flattened alternative using per-field lengths. Each accessor has to add up
//  the lengths of all preceding fields:
//  unsigned char nName, nAddress, nEx1, nEx2, nEx3, nEx4, nEx5;
//  char data[1];
//  const char* getEx1() const { return data + nName + nAddress; }
//  const char* getEx2() const { return data + nName + nAddress + nEx1; }
//  const char* getEx3() const { return data + nName + nAddress + nEx1 + nEx2; }
//  const char* getEx4() const { return data + nName + nAddress + nEx1 + nEx2 + nEx3; }
//  const char* getEx5() const { return data + nName + nAddress + nEx1 + nEx2 + nEx3 + nEx4; }
//  more...
};

struct person3

{

int id;

string name;

string address;

string ex1;

string ex2;

string ex3;

string ex4;

string ex5;

// more...

// var data structure:

// unsigned char nName, nAddress, nEx1, nEx2;

// const char* getEx1() const { return data + nName + nAddress; }

// const char* getEx2() const { return data + nName + nAddress + nEx1; }

// const char* getEx3() const { return data + nName + nAddress + nEx1 + nEx2; }

// const char* getEx4() const { return data + nName + nAddress + nEx1 + nEx2 + nEx3; }

// const char* getEx5() const { return data + nName + nAddress + nEx1 + nEx2 + nEx3 + nEx4; }

// more...

};

这就不光是写代码的复杂了，运行时字段访问的性能也成问题！性能问题可以用另外一种方式——使用直接定位——来解决。

// Same record again (conventional form), followed by a flattened alternative
// that stores start offsets instead of lengths.
struct person4
{
    int id;
    string name;
    string address;
    string ex1;
    string ex2;
    string ex3;
    string ex4;
    string ex5;
//  more...

//  Flattened alternative using start offsets into data[], so every accessor is
//  a single addition. pEor is the end-of-record offset; name starts at 0:
//  unsigned short pAddress, pEx1, pEx2, pEx3, pEx4, pEx5, pEor;
//  char data[1];
//  const char* getName() const { return data + 0; }
//  const char* getAddress() const { return data + pAddress; }
//  const char* getEx1() const { return data + pEx1; }
//  const char* getEx2() const { return data + pEx2; }
//  const char* getEx3() const { return data + pEx3; }
//  const char* getEx4() const { return data + pEx4; }
//  const char* getEx5() const { return data + pEx5; }
//  more...
//  A field's size is the distance to the next field's offset:
//  int sizeEx1() const { return pEx2 - pEx1; }
//  int sizeEx2() const { return pEx3 - pEx2; }
//  int sizeEx3() const { return pEx4 - pEx3; }
//  int sizeEx4() const { return pEx5 - pEx4; }
//  int sizeEx5() const { return pEor - pEx5; }
};

struct person4

{

int id;

string name;

string address;

string ex1;

string ex2;

string ex3;

string ex4;

string ex5;

// more...

// var data structure:

// unsigned short pAddress, pEx1, pEx2, pEx3, pEx4, pEor;

// char data[1];

// const char* getName() const { return data + 0; }

// const char* getAddress() const { return data + pAddress; }

// const char* getEx1() const { return data + pEx1; }

// const char* getEx2() const { return data + pEx2; }

// const char* getEx3() const { return data + pEx3; }

// const char* getEx4() const { return data + pEx4; }

// const char* getEx5() const { return data + pEx5; }

// more...

// int sizeEx1() const { return pEx2 - pEx1; }

// int sizeEx2() const { return pEx3 - pEx2; }

// int sizeEx3() const { return pEx4 - pEx3; }

// int sizeEx4() const { return pEx5 - pEx4; }

// int sizeEx5() const { return pEor - pEx5; }

};

如果需要变长数据的数组，怎么办？简单，offset array + data byte array，具体实现方式与 person4 类似，只不过 offset array 元素需要使用更宽的整数。

管道的境界

作者: rockeet 发表日期: 2009年07月28日分类: shell, 操作系统评论: 0 条阅读次数: 3,552 次

一直在想：如何在 Hadoop.MapReduce 中，插入一个 C 写的 HashFunction，既要高效，又要接口简洁。通过命令行实现调用显然是不行的。刚刚终于想出了：使用管道！

一个非常简单的程序，从stdin读入，写到stdout。多简单！至于效率，管道嘛，本质上就是异步的，自然是buffered&asynchronous 模式。

hash 程序

#include <stdio.h>
/*
 * Simple shift-xor hash of a NUL-terminated string.
 *
 * The mixing is done in unsigned arithmetic: left-shifting a signed int past
 * its range is undefined behaviour, while unsigned shifts simply wrap. For all
 * inputs on which the signed version was well defined the result is the same.
 */
int hash(const char* key)
{
   unsigned int h = 234234u;
   for (; *key; ++key)
      h = (h << 3) ^ (unsigned int)*key;
   return (int)h;
}
/*
 * Filter: reads keys line by line from stdin and writes one hash per line to
 * stdout. fgets keeps the trailing newline, so it is part of the hashed key,
 * and a line longer than the buffer is hashed in several pieces.
 */
int main(int argc, char* argv[])
{
    char buf[256];
    (void)argc;
    (void)argv;
    /* fgets signals end of input (or an error) with NULL, not EOF */
    while (fgets(buf, sizeof(buf), stdin) != NULL)
    {
        printf("%d\n", hash(buf));
    }
    return 0;
}

#include <stdio.h>

int hash(const char* key)

{

int h = 234234;

for (; *key; ++key)

h = h << 3 ^ *key;

return h;

}

int main(int argc, char*[] argv)

{

char buf[256];

while (fgets(buf, sizeof(buf), stdin) != EOF)

{

printf("%d/n", hash(buf));

}

return 0;

}

框架可以一边不断往管道写key，一边从中读取结果，这两个工作完全可以是异步的。对hash程序来说，如果stdin/stdout是全缓冲的，就几乎没有io的开销，因为几百几千次 fgets/printf 才会导致一次系统调用。

对frame程序也是一样的。

在 hadoop.streaming 中，hash 函数目前还必须由 java 类指定，如果使用这种方式，那就更 unix 了。

fuck淘宝，fuck原叶绿茶

作者: rockeet 发表日期: 2009年07月13日分类: 杂谈评论: 1 条阅读次数: 3,588 次

昨天，渴了，买了瓶原叶绿茶，准备扣上盖子扔瓶子时，发现上面说：

N2KOKC5ND9L 十元淘宝券兑奖09/11/30止口

跑到淘宝上看，找不着哪兑奖，后来终于发现：

http://pro.taobao.com/yuanye/yuanye_index.htm

输入号码，点击“适用10元抵价券商品区”结果是一大堆给我都不要的东西。

我不殚以最坏的恶意来揣摩这些fuckee，可是，这些fuckee却如此侮辱我的智商。而且还是如此公开、如此明目张胆的侮辱。

侮辱了大家的智商，还要浪费大家的时间，浪费大家的精力。如果买了它的东西，在你被侮辱的同时，人家还说：傻逼真乖，把钱给爷！

很基本也很诡异的fread

作者: rockeet 发表日期: 2009年07月03日分类: C++ 评论: 0 条阅读次数: 2,661 次

本网站仅单台服务器: 2核2G 99￥包年，集成兼容 MySQL 的 MyTopling 高压缩高性能数据库

本网站仅单台服务器: MyTopling 2核2G 99￥包年

当搜索词中有错别字时，搜索引擎会尝试纠错

通过相似拼音纠错

基于自动机的算法可以完美解决这个指数爆炸问题

这个算法也可以用来解决用户输入预测(智能提示)功能

基于编辑距离的纠错

创建 DFA Key 与搜索 DFA Key 的耗时包含了收集网页展示需要的信息，耗时占比90%以上！

MapReduce做了多余的事情

先说 Map

MapReduce Key Revert ——特定数据模式的负载均衡

符号、记法

memory pool 的高效实现（代码）

memory pool 的高效实现

memory pool

mpool

世界上应用最广泛的虚拟机是啥？

malloc/free 的开销，如何去掉这种开销？

可变长度数据结构

管道的境界

fuck淘宝，fuck原叶绿茶

很基本也很诡异的fread

近期文章

近期评论

文章归档

分类目录

功能

本网站仅单台服务器: 2核2G 99￥包年，集成兼容 MySQL 的 MyTopling 高压缩高性能数据库

本网站仅单台服务器: MyTopling 2核2G 99￥包年

当搜索词中有错别字时，搜索引擎会尝试纠错

通过相似拼音纠错

基于自动机的算法可以完美解决这个指数爆炸问题

这个算法也可以用来解决用户输入预测(智能提示)功能

基于编辑距离的纠错

创建 DFA Key 与 搜索 DFA Key 的 耗时 包含了 收集网页展示需要的信息，耗时占比90%以上！

先说 Map

符号、记法

memory pool

mpool

近期文章

近期评论

文章归档

分类目录

功能

创建 DFA Key 与搜索 DFA Key 的耗时包含了收集网页展示需要的信息，耗时占比90%以上！